In [6]:
import pandas as pd
import ast  # To safely evaluate the string as a Python list

# Load the raw posts table from CSV (adjust the path as needed). The code
# below reads the columns 'Title', 'Text', 'Upvotes', 'Comments Data',
# 'Subreddit' and 'Timestamp' from every row of this frame.
df = pd.read_csv('reddit_data_with_comments.csv')

# A function to split the comment data
def separate_comments(row):
    """Expand one post row into a list of per-comment records.

    Parameters
    ----------
    row : mapping-like (for example a ``pandas.Series`` from ``df.iterrows()``)
        Must provide the keys 'Title', 'Text', 'Upvotes', 'Comments Data',
        'Subreddit' and 'Timestamp'. 'Comments Data' is expected to hold the
        string form of a list whose items are sequences starting with
        ``(comment_text, comment_upvotes, comment_timestamp)``.

    Returns
    -------
    list of dict
        One record per well-formed comment, carrying the post-level fields
        next to the comment fields. The list is empty when 'Comments Data'
        cannot be parsed or does not parse to a list/tuple. Individual
        entries that are not list/tuple values with at least three items are
        skipped instead of aborting the whole run.
    """
    try:
        # literal_eval only accepts Python literals, so no code is executed
        comments_data = ast.literal_eval(row['Comments Data'])
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: malformed or missing (NaN) data.
        # TypeError: literals that parse but cannot be built, e.g. an
        # unhashable dict key.
        return []

    # A scalar or mapping literal is not a list of comments
    if not isinstance(comments_data, (list, tuple)):
        return []

    # The post body is the same for every comment, so resolve it once
    post_text = row['Text'] if pd.notnull(row['Text']) else None

    comment_rows = []
    for comment in comments_data:
        # Guard against truncated entries and against bare strings, which
        # would otherwise be indexed character by character
        if not isinstance(comment, (list, tuple)) or len(comment) < 3:
            continue
        comment_text, comment_upvotes, comment_timestamp = comment[:3]

        comment_rows.append({
            'Title': row['Title'],
            'Text': post_text,
            'Upvotes': row['Upvotes'],
            'Comment Text': comment_text,
            'Comment Upvotes': comment_upvotes,
            'Comment Timestamp': comment_timestamp,
            'Subreddit': row['Subreddit'],
            'Original Timestamp': row['Timestamp']
        })
    return comment_rows

# Flatten the posts table: every post contributes one record per parsed comment
all_comments = [
    comment_record
    for _, row in df.iterrows()
    for comment_record in separate_comments(row)
]

# Build the long-format table (one row per comment) from the collected records
new_df = pd.DataFrame(all_comments)

# Persist the result; index=False leaves the positional index out of the file
new_df.to_csv('separated_comments.csv', index=False)

# Preview the first rows as a quick sanity check
print(new_df.head())


                                               Title  Text  Upvotes  \
0  Are you ok with the DOD removing articles from...  None     7614   
1  Are you ok with the DOD removing articles from...  None     7614   
2  Are you ok with the DOD removing articles from...  None     7614   
3  Are you ok with the DOD removing articles from...  None     7614   
4  Are you ok with the DOD removing articles from...  None     7614   

                                        Comment Text  Comment Upvotes  \
0            Navajo code talkers are still not back.             2571   
1  I’m just thinking about how these are the same...             5572   
2  Anyone who is okay with this does not respect ...             1696   
3  It’s not DEI, but racism when you remove MOH p...              579   
4  You should just call it what it is. The curren...              796   

   Comment Timestamp  Subreddit  Original Timestamp  
0       1.742301e+09  AskReddit        1.742300e+09  
1       1.742300e+09  AskR

In [8]:
import pandas as pd
import ast  # To safely evaluate the string as a Python list

# Load the raw posts table from CSV (adjust the path as needed). The code
# below reads the columns 'Title', 'Text', 'Upvotes', 'Comments Data',
# 'Subreddit' and 'Timestamp' from every row of this frame.
df = pd.read_csv('reddit_data_with_comments.csv')

# A function to split the comment data
def separate_comments(row):
    """Expand one post row into per-comment records with a combined text field.

    Parameters
    ----------
    row : mapping-like (for example a ``pandas.Series`` from ``df.iterrows()``)
        Must provide the keys 'Title', 'Text', 'Upvotes', 'Comments Data',
        'Subreddit' and 'Timestamp'. 'Comments Data' is expected to hold the
        string form of a list whose items are sequences starting with
        ``(comment_text, comment_upvotes, comment_timestamp)``.

    Returns
    -------
    list of dict
        One record per well-formed comment. The 'Text' field joins the post
        title, the post body (empty when missing) and the comment text as
        ``"Title: ...\\nText: ...\\nComment: ..."``. The list is empty when
        'Comments Data' cannot be parsed or does not parse to a list/tuple.
        Individual entries that are not list/tuple values with at least
        three items are skipped instead of aborting the whole run.
    """
    try:
        # literal_eval only accepts Python literals, so no code is executed
        comments_data = ast.literal_eval(row['Comments Data'])
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: malformed or missing (NaN) data.
        # TypeError: literals that parse but cannot be built, e.g. an
        # unhashable dict key.
        return []

    # A scalar or mapping literal is not a list of comments
    if not isinstance(comments_data, (list, tuple)):
        return []

    # The title/body prefix is identical for every comment of the post
    post_text = row['Text'] if pd.notnull(row['Text']) else ''
    post_header = f"Title: {row['Title']}\nText: {post_text}\nComment: "

    comment_rows = []
    for comment in comments_data:
        # Guard against truncated entries and against bare strings, which
        # would otherwise be indexed character by character
        if not isinstance(comment, (list, tuple)) or len(comment) < 3:
            continue
        comment_text, comment_upvotes, comment_timestamp = comment[:3]

        comment_rows.append({
            'Text': f"{post_header}{comment_text}",
            'Upvotes': row['Upvotes'],
            'Comment Upvotes': comment_upvotes,
            'Comment Timestamp': comment_timestamp,
            'Subreddit': row['Subreddit'],
            'Original Timestamp': row['Timestamp']
        })
    return comment_rows

# Flatten the posts table: every post contributes one record per parsed comment
all_comments = [
    comment_record
    for _, row in df.iterrows()
    for comment_record in separate_comments(row)
]

# Build the long-format table (one row per comment) from the collected records
new_df = pd.DataFrame(all_comments)

# Persist the result; index=False leaves the positional index out of the file
new_df.to_csv('separated_comments_combined.csv', index=False)

# Preview the first rows as a quick sanity check
print(new_df.head())


                                                Text  Upvotes  \
0  Title: Are you ok with the DOD removing articl...     7614   
1  Title: Are you ok with the DOD removing articl...     7614   
2  Title: Are you ok with the DOD removing articl...     7614   
3  Title: Are you ok with the DOD removing articl...     7614   
4  Title: Are you ok with the DOD removing articl...     7614   

   Comment Upvotes  Comment Timestamp  Subreddit  Original Timestamp  
0             2571       1.742301e+09  AskReddit        1.742300e+09  
1             5572       1.742300e+09  AskReddit        1.742300e+09  
2             1696       1.742301e+09  AskReddit        1.742300e+09  
3              579       1.742302e+09  AskReddit        1.742300e+09  
4              796       1.742300e+09  AskReddit        1.742300e+09  


In [None]:
import pandas as pd

# Read the combined per-comment table
df = pd.read_csv('separated_comments_combined.csv')

# Draw 30 random rows (no random_state is passed, so the draw is not pinned
# to a seed) and keep the classification columns in a fixed order
sample_df = df.sample(n=30).loc[
    :,
    ['Text', 'Upvotes', 'Comment Upvotes', 'Comment Timestamp', 'Subreddit', 'Original Timestamp'],
]

# Show the drawn rows
print(sample_df)

# Write the sample to disk so it can be reused later
sample_df.to_csv('sample_for_classification.csv', index=False)


                                                    Text  Upvotes  \
60136  Title: Atletico Madrid 2 - [3] Barcelona - Lam...     5384   
32015  Title: JUST EAT THE CRUST IT'S JUST BREAD\nTex...     1753   
28769  Title: will be less talkative from now on.\nTe...     1330   
46419  Title: What is up with Casey Anthony popping u...      672   
7464   Title: Redditors, how do you feel a grassroots...     9585   
15303  Title: what was the first wine you tried that ...       17   
78323  Title: What are the best online PvP games to g...      328   
53134  Title: [DR] Christian Eriksen hasn't received ...      532   
78534  Title: Do current-gen consoles feel outdated e...        0   
63652  Title: Dujon Sterling calling Celtic Park a Sh...      182   
27494  Title: This man had kids for one reason.\nText...    79825   
74931  Title: On March 15th, 2069 years ago, Assassin...     7974   
40681  Title: pee pee shaped cheeto\nText: \nComment:...        1   
6792   Title: You are forced to bu

In [15]:
import pandas as pd

# Read the sampled rows, derive calendar dates from the epoch-second
# timestamps, and keep only the columns used for classification.
df = (
    pd.read_csv('sample_for_classification.csv')
    .assign(**{
        # Post date: 'Original Timestamp' parsed as seconds, reduced to a date
        "Date": lambda frame: pd.to_datetime(
            frame["Original Timestamp"], unit="s"
        ).dt.date,
        # Comment date: 'Comment Timestamp' parsed as seconds, reduced to a date
        "Comment Date": lambda frame: pd.to_datetime(
            frame["Comment Timestamp"], unit="s"
        ).dt.date,
    })
    [['Text', 'Upvotes', 'Comment Upvotes', 'Comment Date', 'Subreddit', 'Date']]
)

# Show the resulting table
print(df)

# Write the date-normalised sample to disk
df.to_csv('sample_for_classification_FixedDate.csv', index=False)


                                                 Text  Upvotes  \
0   Title: Atletico Madrid 2 - [3] Barcelona - Lam...     5384   
1   Title: JUST EAT THE CRUST IT'S JUST BREAD\nTex...     1753   
2   Title: will be less talkative from now on.\nTe...     1330   
3   Title: What is up with Casey Anthony popping u...      672   
4   Title: Redditors, how do you feel a grassroots...     9585   
5   Title: what was the first wine you tried that ...       17   
6   Title: What are the best online PvP games to g...      328   
7   Title: [DR] Christian Eriksen hasn't received ...      532   
8   Title: Do current-gen consoles feel outdated e...        0   
9   Title: Dujon Sterling calling Celtic Park a Sh...      182   
10  Title: This man had kids for one reason.\nText...    79825   
11  Title: On March 15th, 2069 years ago, Assassin...     7974   
12  Title: pee pee shaped cheeto\nText: \nComment:...        1   
13  Title: You are forced to buy one thing everyda...       78   
14  Title: