In [1]:
import pandas as pd
import string
import nltk
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

df = pd.read_json('data/controversial-comments.jsonl',lines=True)
df['txt'] = df['txt'].str.lower()
df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adoni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,con,txt
0,0,well it's great that he did something about th...
1,0,you are right mr. president.
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...
...,...,...
949995,0,i genuinely can't understand how anyone can su...
949996,0,"as a reminder, this subreddit [is for civil di..."
949997,0,k. don't explain why or anything.
949998,0,[deleted]


In [2]:
def remove_punctuation(txt):
    for punctuation in string.punctuation:
        txt = txt.replace(punctuation, '')
    return txt
df['txt'] = df['txt'].apply(remove_punctuation)
df

Unnamed: 0,con,txt
0,0,well its great that he did something about tho...
1,0,you are right mr president
2,0,you have given no input apart from saying i am...
3,0,i get the frustration but the reason they want...
4,0,i am far from an expert on tpp and i would ten...
...,...,...
949995,0,i genuinely cant understand how anyone can sup...
949996,0,as a reminder this subreddit is for civil disc...
949997,0,k dont explain why or anything
949998,0,deleted


In [3]:
df['txt'] = df['txt'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df

Unnamed: 0,con,txt
0,0,well great something beliefs office doubt trum...
1,0,right mr president
2,0,given input apart saying wrong argument clearly
3,0,get frustration reason want way foundation com...
4,0,far expert tpp would tend agree lot problems u...
...,...,...
949995,0,genuinely cant understand anyone support point...
949996,0,reminder subreddit civil discussionhttpswwwred...
949997,0,k dont explain anything
949998,0,deleted


In [4]:
ps = PorterStemmer()

In [5]:
def stem_sentences(sentence):
    tokens = sentence.split()
    stemmed_tokens = [ps.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

df['txt'] = df['txt'].apply(stem_sentences)
df

Unnamed: 0,con,txt
0,0,well great someth belief offic doubt trump wou...
1,0,right mr presid
2,0,given input apart say wrong argument clearli
3,0,get frustrat reason want way foundat complex p...
4,0,far expert tpp would tend agre lot problem und...
...,...,...
949995,0,genuin cant understand anyon support point ok ...
949996,0,remind subreddit civil discussionhttpswwwreddi...
949997,0,k dont explain anyth
949998,0,delet


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
bag_of_words = count.fit_transform(df['txt'])
bag_of_words
print(bag_of_words)


  (0, 218385)	1
  (0, 71213)	1
  (0, 179925)	2
  (0, 25872)	1
  (0, 144412)	1
  (0, 53030)	1
  (0, 203167)	1
  (0, 222143)	1
  (0, 63161)	1
  (0, 208894)	1
  (0, 112005)	1
  (0, 163099)	2
  (0, 78830)	1
  (0, 143531)	1
  (0, 43495)	1
  (0, 216587)	1
  (1, 167511)	1
  (1, 136191)	1
  (1, 156520)	1
  (2, 69242)	1
  (2, 114171)	1
  (2, 19826)	1
  (2, 171986)	1
  (2, 222414)	1
  (2, 20674)	1
  :	:
  (949996, 58684)	1
  (949996, 53279)	1
  (949996, 40007)	3
  (949996, 50782)	1
  (949996, 220330)	1
  (949996, 160486)	1
  (949996, 80152)	1
  (949996, 112882)	2
  (949996, 24164)	1
  (949996, 209150)	1
  (949996, 165421)	1
  (949996, 151120)	1
  (949997, 52679)	1
  (949997, 19670)	1
  (949997, 60317)	1
  (949998, 48295)	1
  (949999, 167511)	1
  (949999, 62562)	1
  (949999, 66565)	1
  (949999, 154830)	1
  (949999, 174858)	1
  (949999, 34963)	1
  (949999, 223303)	1
  (949999, 120993)	1
  (949999, 179501)	1


In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk import word_tokenize

def tokenize(text):
    if not text:
        text = ''
    return pos_tag(word_tokenize(text))

tagged = df.txt.apply(tokenize)


In [8]:
tagged

0         [well, great, someth, belief, offic, doubt, tr...
1                                       [right, mr, presid]
2         [given, input, apart, say, wrong, argument, cl...
3         [get, frustrat, reason, want, way, foundat, co...
4         [far, expert, tpp, would, tend, agre, lot, pro...
                                ...                        
949995    [genuin, cant, understand, anyon, support, poi...
949996    [remind, subreddit, civil, discussionhttpswwwr...
949997                            [k, dont, explain, anyth]
949998                                              [delet]
949999    [ya, sociopath, known, celebr, posit, feel, fu...
Name: txt, Length: 950000, dtype: object

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(df['txt'])
feature_matrix


<950000x226148 sparse matrix of type '<class 'numpy.float64'>'
	with 15344178 stored elements in Compressed Sparse Row format>

In [None]:
"""
The reasons we need to use these techniques is to help distinguish text on things like social media for example we can use all
of these starting with the bag of words, can be used to count the number of certain things. Let us think of Twitter and the hash
tags that are trending, this is a way we can see certain ones overall and for each user or by search. Sometimes people forget
to use the hashtag so the words in each tweet can also now be counted. Then we can tag them so we know nouns and verbs, ect like
in the english language as we did above. Lastly if we have repeating words outside of the stopwords that are across many tweets
then it is not as important as a noun like a person or place, like a big pop culture story that becomes trending. 
"""

