In [11]:
# import relevant libraries

import pandas as pd

In [12]:
# import data into 2 dataframe

df1 = pd.read_csv("/content/reddit_posts.csv", low_memory=False)

In [13]:
# make a subset of df1 to only incluse -

df2 = df1[["author", "created_utc", "selftext", "title"]]

In [14]:
# Individual columns and shape of dataframe

print(df2.columns)
print(df2.shape)

Index(['author', 'created_utc', 'selftext', 'title'], dtype='object')
(97115, 4)


In [15]:
# print info about the data

print(df2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97115 entries, 0 to 97114
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   author       97115 non-null  object
 1   created_utc  97115 non-null  int64 
 2   selftext     53271 non-null  object
 3   title        97115 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.0+ MB
None


In [18]:
# Identify and handle non-numeric values
df2["created_utc"] = pd.to_numeric(df2["created_utc"], errors = "coerce")

# Drop rows with NaN values in 'utc_timestamp'
df2 = df2.dropna(subset = ["created_utc"])

# Convert the cleaned 'created_utc' column to datetime
df2["readable_date"] = pd.to_datetime(df2["created_utc"], unit = "s")

In [25]:
# Print the first few rows to check the result

df2.head()

Unnamed: 0,author,selftext,title,year,month,day,hour,minute
0,Wriiight,,"Hey, Arlington, leave those kids alone! (nyaaaaa)",2008,6,5,17,21
1,Wriiight,,DC pol on a vendetta against park benches,2008,6,6,22,5
2,falseprophet,,Supreme Court Strikes Down 32-year-old ban on ...,2008,6,26,15,34
3,falseprophet,,"Students, Rec Centers, and Libraries in D.C. w...",2008,6,27,10,34
4,falseprophet,,After overturning the D.C. handgun ban [comic],2008,6,27,12,20


In [24]:
# dropping column "created_utc"

df2.drop(columns = ["created_utc"], inplace=True)

KeyError: "['created_utc'] not found in axis"

In [26]:
# Print the first few rows to check the result

df2.head()

Unnamed: 0,author,selftext,title,year,month,day,hour,minute
0,Wriiight,,"Hey, Arlington, leave those kids alone! (nyaaaaa)",2008,6,5,17,21
1,Wriiight,,DC pol on a vendetta against park benches,2008,6,6,22,5
2,falseprophet,,Supreme Court Strikes Down 32-year-old ban on ...,2008,6,26,15,34
3,falseprophet,,"Students, Rec Centers, and Libraries in D.C. w...",2008,6,27,10,34
4,falseprophet,,After overturning the D.C. handgun ban [comic],2008,6,27,12,20


In [27]:
# Splitting readable_date into year, month, day, hour and minute

df2["year"] = df2["readable_date"].dt.year
df2["month"] = df2["readable_date"].dt.month
df2["day"] = df2["readable_date"].dt.day
df2["hour"] = df2["readable_date"].dt.hour
df2["minute"] = df2["readable_date"].dt.minute

KeyError: 'readable_date'

In [28]:
# dropping column "readable_date"

df2.drop(columns = ["readable_date"], inplace=True)

KeyError: "['readable_date'] not found in axis"

In [29]:
# Print the first few rows to check the result

print(df2.head())

         author selftext                                              title  \
0      Wriiight      NaN  Hey, Arlington, leave those kids alone! (nyaaaaa)   
1      Wriiight      NaN          DC pol on a vendetta against park benches   
2  falseprophet      NaN  Supreme Court Strikes Down 32-year-old ban on ...   
3  falseprophet      NaN  Students, Rec Centers, and Libraries in D.C. w...   
4  falseprophet      NaN     After overturning the D.C. handgun ban [comic]   

   year  month  day  hour  minute  
0  2008      6    5    17      21  
1  2008      6    6    22       5  
2  2008      6   26    15      34  
3  2008      6   27    10      34  
4  2008      6   27    12      20  


In [31]:
# Rearrange the DataFrame

new_order = ["year", "month", "day", "hour", "minute", "author", "title", "selftext"]
df2 = df2[new_order]

In [32]:
# Print the first few rows to check the result

df2.head()

Unnamed: 0,year,month,day,hour,minute,author,title,selftext
0,2008,6,5,17,21,Wriiight,"Hey, Arlington, leave those kids alone! (nyaaaaa)",
1,2008,6,6,22,5,Wriiight,DC pol on a vendetta against park benches,
2,2008,6,26,15,34,falseprophet,Supreme Court Strikes Down 32-year-old ban on ...,
3,2008,6,27,10,34,falseprophet,"Students, Rec Centers, and Libraries in D.C. w...",
4,2008,6,27,12,20,falseprophet,After overturning the D.C. handgun ban [comic],


In [33]:
# Combining data in 2 rows into one (Combined Text = Title + Selftext)

df2["Combined_Text"] = df2["title"].astype(str) + ' ' + df2["selftext"].astype(str)

In [34]:
# Dictionary of keywords to filter posts

keywords = {
    "crime", "criminal", "theft", "burglary", "robbery",
    "assault", "murder", "homicide", "arson", "vandalism",
    "fraud", "kidnapping", "domestic violence", "gun violence",
    "drug trafficking", "gang", "traffic violation", "misdemeanor",
    "felony", "suspicious activity", "victim", "witness",
    "court", "trial", "prosecution", "defense", "sentence",
    "arrest", "conviction", "penalty", "law enforcement",
    "police", "investigation", "evidence", "crime scene",
    "criminal justice", "penitentiary", "rehabilitation",
    "recidivism", "public safety", "violence", "assault",
    "break-in", "home invasion", "white collar crime",
    "cybercrime", "sexual assault", "hate crime", "juvenile delinquency",
    "cyberbullying", "child abuse", "exploitation", "extortion",
    "fraudulent activity", "counterfeiting", "insider trading",
    "terrorism", "hate speech", "organized crime", "bribery",
    "smuggling", "identity theft", "stalking", "domestic abuse",
    "assault with a deadly weapon", "sex trafficking", "human trafficking",
    "petty crime", "drug abuse", "cyberstalking", "restitution",
    "disorderly conduct", "false arrest", "violation of probation",
    "trespassing", "vehicular manslaughter", "child pornography",
    "public intoxication", "gang violence", "weapons charge",
    "underage drinking", "operating while intoxicated (OWI)",
    "search warrant", "Miranda rights", "plea bargain",
    "clemency", "subpoena", "witness intimidation",
    "domestic terrorism", "criminal conspiracy", "manslaughter",
    "racketeering", "drug possession", "prostitution",
    "illegal immigration", "forgery", "torture", "child neglect",
    "animal cruelty", "corporate crime", "arson for profit",
    "extreme risk protection order", "stolen property",
    "conspiracy", "unlawful assembly", "coercion"
}

In [35]:
# Create a boolean mask for matching the keywords in either column 1 or column 2
mask = df2["Combined_Text"].str.contains("|".join(keywords), case=False, na=False)

# Create a subset of the DataFrame using the mask
subset_df = df2[mask]

# Print the first few rows of the subset DataFrame
print(subset_df.head())

  mask = df2['Combined_Text'].str.contains('|'.join(keywords), case=False, na=False)


     year  month  day  hour  minute          author  \
2    2008      6   26    15      34    falseprophet   
22   2009      8   21    15      21       [deleted]   
58   2009     11   19    23      47  yellowcakewalk   
62   2009     11   25     2      26           ckcin   
101  2010      2    4    16      25       [deleted]   

                                                 title  \
2    Supreme Court Strikes Down 32-year-old ban on ...   
22   Are You Safe app tells you how many homicides ...   
58   Anyone else in on this class action suit again...   
62   WTU Loses Court Challenge To Layoffs - D.C. Wi...   
101  Hey, Reddit! Who wants to be a snomad with me ...   

                                              selftext  \
2                                                  NaN   
22                                                 NaN   
58                                                 NaN   
62                                                 NaN   
101  According to the [latest 

In [36]:
subset_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5545 entries, 2 to 97098
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   year           5545 non-null   int32 
 1   month          5545 non-null   int32 
 2   day            5545 non-null   int32 
 3   hour           5545 non-null   int32 
 4   minute         5545 non-null   int32 
 5   author         5545 non-null   object
 6   title          5545 non-null   object
 7   selftext       2826 non-null   object
 8   Combined_Text  5545 non-null   object
dtypes: int32(5), object(4)
memory usage: 324.9+ KB


In [37]:
subset_df.to_csv("nlp_data.csv", index=False)