In [26]:
import pandas as pd
from nltk.tokenize import word_tokenize
import string
import spacy
import numpy as np

In [39]:
# spaCy pipeline package used by the NER cells further down.
SPACY_MODEL_NAME = "en_core_web_md"
nlp_spacy = spacy.load(SPACY_MODEL_NAME)

In [41]:
# Cleaned Reddit posts; later cells read the post_id, post_text and
# post_text_count columns from this frame.
CLEANED_POSTS_CSV = "../data/processed_reddit_data/cleaned_reddit_12-21_to_1115.csv"
input_df = pd.read_csv(CLEANED_POSTS_CSV)

In [42]:
# NER is only run on posts whose word count (post_text_count) is greater than 4.
# .copy() detaches the filtered frame from the one read from disk, so that the
# entity columns written by later cells go into an independent DataFrame and
# pandas does not raise SettingWithCopyWarning.
input_df = input_df[input_df['post_text_count'] > 4].copy()
input_df.head()

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count
0,sandiego,going to visit san diego next week any places...,x4nzh2,Fearmkultra,2022-09-03 06:57:58+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,going to visit san diego next week any places ...,12
2,sandiego,whaley house picture of ghost,x4ntm7,Open_Construction_31,2022-09-03 06:47:09+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,whaley house picture of ghost as a kid i saw t...,199
3,sandiego,language exchange,x4n6xv,Poshorock,2022-09-03 06:07:46+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,language exchange is there someone by there wh...,31
4,SanDiegan,chula vista police stopping cars going east on...,x4n5aj,kaptaincorn,2022-09-03 06:04:54+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,chula vista police stopping cars going east on...,57
5,SanDiegan,todd gloria finalizes plan to change park blvd...,x4n2rv,Lemonade_IceCold,2022-09-03 06:00:38+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,todd gloria finalizes plan to change park blvd...,666


In [58]:
# Compare the number of distinct post IDs with the number of rows; a smaller
# first number means some post_id values occur more than once.
unique_post_ids = input_df['post_id'].nunique(dropna=False)
total_rows = input_df.shape[0]
print("Unique post ID Count: ", unique_post_ids)
print("Dataset size:", total_rows)

Unique post ID Count:  43273
Dataset size: 43421


In [43]:
# This is exploration of all the entities present
%%time
ent_dict = []
for index, row in input_df.iterrows():
    #print(post)
    post_id = row['post_id']
    post = row['post_text']
    doc_spacy = nlp_spacy(post)
    for ent in doc_spacy.ents:
        ent_dict.append([post_id,ent.label_,ent.text])
        
#print(ent_dict)   

CPU times: user 12min 19s, sys: 44.3 s, total: 13min 3s
Wall time: 13min 27s


In [55]:
# Distinct entity labels found during the exploration pass (the label is the
# second field of every ent_dict record); np.unique returns them sorted.
labels_seen = np.unique([record[1] for record in ent_dict])

# Pair every label with spaCy's description of it and display the result.
final_lst = [[label, spacy.explain(label)] for label in labels_seen]
final_lst

[['CARDINAL', 'Numerals that do not fall under another type'],
 ['DATE', 'Absolute or relative dates or periods'],
 ['EVENT', 'Named hurricanes, battles, wars, sports events, etc.'],
 ['FAC', 'Buildings, airports, highways, bridges, etc.'],
 ['GPE', 'Countries, cities, states'],
 ['LANGUAGE', 'Any named language'],
 ['LAW', 'Named documents made into laws.'],
 ['LOC', 'Non-GPE locations, mountain ranges, bodies of water'],
 ['MONEY', 'Monetary values, including unit'],
 ['NORP', 'Nationalities or religious or political groups'],
 ['ORDINAL', '"first", "second", etc.'],
 ['ORG', 'Companies, agencies, institutions, etc.'],
 ['PERCENT', 'Percentage, including "%"'],
 ['PERSON', 'People, including fictional'],
 ['PRODUCT', 'Objects, vehicles, foods, etc. (not services)'],
 ['QUANTITY', 'Measurements, as of weight or distance'],
 ['TIME', 'Times smaller than a day'],
 ['WORK_OF_ART', 'Titles of books, songs, etc.']]

In [120]:
# Entity labels that get their own column in the per-post NER step.
# One label per line so the selection is easy to edit.
ent_labels = [
    'DATE',
    'EVENT',
    'FAC',
    'GPE',
    'LANGUAGE',
    'LAW',
    'LOC',
    'NORP',
    'ORG',
    'PERSON',
    'TIME',
]

In [122]:
%%time
# Let's add the list in a column
# Important Labels:
# DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, NORP, ORG, PERSON, TIME

for index, row in input_df.iterrows():
    #print("index",index)
    ent_label_dict = dict.fromkeys(ent_labels,[])
    post = row['post_text']
    doc_spacy = nlp_spacy(post)
    for ent in doc_spacy.ents:
        if ent.label_ in ent_label_dict:
            ent_label_dict[ent.label_] = ent_label_dict[ent.label_] + [ent.text]
        
            # Updating the label value recognized in the dataframe
            input_df.at[index,ent.label_] = ent_label_dict[ent.label_]

CPU times: user 12min 10s, sys: 46.5 s, total: 12min 56s
Wall time: 13min 22s


In [123]:
# Preview the first five rows, now including the entity columns.
input_df.head(5)

Unnamed: 0,subreddit,title,post_id,post_author,post_utc,full_link,post_text,post_text_count,ORG,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,NORP,PERSON,TIME
0,sandiego,going to visit san diego next week any places...,x4nzh2,Fearmkultra,2022-09-03 06:57:58+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,going to visit san diego next week any places ...,12,[san diego],[next week],,,,,,,,,
2,sandiego,whaley house picture of ghost,x4ntm7,Open_Construction_31,2022-09-03 06:47:09+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,whaley house picture of ghost as a kid i saw t...,199,"[whaley house, the whaley house]","[13, 25 yrs ago]",,,[san diegans],,,,,,"[a minute later, late nightearly morning]"
3,sandiego,language exchange,x4n6xv,Poshorock,2022-09-03 06:07:46+00:00,https://www.reddit.com/r/sandiego/comments/x4n...,language exchange is there someone by there wh...,31,,,,,,[english],,,[spanish],[san diego],
4,SanDiegan,chula vista police stopping cars going east on...,x4n5aj,kaptaincorn,2022-09-03 06:04:54+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,chula vista police stopping cars going east on...,57,,,,,[chula vista],,,,,,
5,SanDiegan,todd gloria finalizes plan to change park blvd...,x4n2rv,Lemonade_IceCold,2022-09-03 06:00:38+00:00,https://www.reddit.com/r/SanDiegan/comments/x4...,todd gloria finalizes plan to change park blvd...,666,[gtonly],,,[balboa park],,,,,[north american],"[todd gloria, kevin]",


In [127]:
# Non-null values per column. For an entity column this is the number of posts
# in which at least one entity of that type was found.
input_df.notna().sum()

subreddit          43421
title              43420
post_id            43421
post_author        43421
post_utc           43421
full_link          43421
post_text          43421
post_text_count    43421
ORG                10637
DATE               17438
EVENT                109
FAC                 1187
GPE                11175
LANGUAGE             200
LAW                  376
LOC                 1649
NORP                2731
PERSON              9474
TIME                5928
dtype: int64

In [128]:
# Persist the posts together with their entity columns.
# NOTE: the DataFrame index is written as well (to_csv default), and list-valued
# cells are stored as their string representation, so they come back as plain
# strings when this file is read again.
NER_OUTPUT_CSV = "../data/processed_reddit_data/cleaned_reddit_ner_12-21_to_1115.csv"
input_df.to_csv(NER_OUTPUT_CSV, header=True)

In [132]:
# Spot check: ORG values of the first 20 posts that have at least one ORG entity.
input_df.loc[input_df['ORG'].notna(), 'ORG'].head(20)

0                                  [san diego]
2             [whaley house, the whaley house]
5                                     [gtonly]
11                              [paseo, paseo]
14                              [honda, honda]
28       [rita atkinson apartments minidouble]
29                                [ac, ac, ac]
31    [uc scam diego, uc scam diego shirt lmk]
33                                    [toyota]
34                            [house of blues]
42                                    [triton]
59                      [the san diego humane]
63           [the san diego police department]
65                                      [sims]
67                                        [ez]
68                                [awp 3, tas]
74                          [szn massive heat]
75                                    [ge, ge]
76                                    [ge, ge]
81                                       [gta]
Name: ORG, dtype: object

In [133]:
# Spot check: GPE values of the first 20 posts that have at least one GPE entity.
input_df.loc[input_df['GPE'].notna(), 'GPE'].head(20)

2                                         [san diegans]
4                                         [chula vista]
9                           [servercocktail, san diego]
12                                          [san diego]
13        [san diego twohearts, connecticut, san diego]
14                                              [texas]
17                                         [california]
21                                  [san diego skyline]
33                                   [broadway st, y.o]
36                                         [clairemont]
37                                         [clairemont]
41                                          [san diego]
43                                          [san diego]
55                                          [san diego]
57                                          [san diego]
59    [virginia, san diego, san diego, san diego, sa...
61                                             [roymar]
80                                             [

In [134]:
# Spot check: NORP values of the first 20 posts that have at least one NORP entity.
input_df.loc[input_df['NORP'].notna(), 'NORP'].head(20)

3                                        [spanish]
5                                 [north american]
80                                [philadelphians]
100                                       [indian]
107                                        [dutch]
108    [malaysians, kawankawans, southeast asians]
142                                      [mexican]
146                                 [californians]
162      [indian, indian, uc san diegos, american]
166                                      [british]
174                          [san diegans, afghan]
177                                      [mexican]
193                                [german, irish]
217                             [english, english]
230              [taiwanese, taiwanese, taiwanese]
269                                      [mexican]
283                                      [mexican]
284                             [japanese, japans]
302                               [julian, julian]
341                            

In [135]:
# Spot check: DATE values of the first 20 posts that have at least one DATE entity.
input_df.loc[input_df['DATE'].notna(), 'DATE'].head(20)

0                                           [next week]
2                                      [13, 25 yrs ago]
11         [3 days ago, today, today, the same week, 9]
13                 [one year anniversary, last october]
15    [this year last year, the upcoming year, this ...
21                                         [night 2015]
22                                       [fall quarter]
27    [homecoming weekend, the start of 3rd quarter,...
28                               [jul 27, sep 5, sep 5]
29                                        [last summer]
30                                       [this weekend]
33                                               [2019]
35                                             [2 days]
41    [2 day, halloween, october, two day, october, ...
42                          [the start of fall quarter]
45                                                [80s]
47                                      [this semester]
51                             [every week, the 