In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
import os
os.getcwd()

'/Users/annamitchell/Desktop/GA_Docs/Projects/NLP_Project_4/notebooks'

In [3]:
df = pd.read_csv('/Users/annamitchell/Desktop/GA_Docs/Projects/NLP_Project_4/datasets/comments.csv')

In [4]:
df.shape

(11261, 2)

In [5]:
df.head()

Unnamed: 0,body,subreddit
0,They shared.\nThey did not share.\nThey resist...,elonmusk
1,"Sorry to break it to you, but corporate monopo...",elonmusk
2,And wearing his Elon's Musk,elonmusk
3,This is where it starts getting a little weird,elonmusk
4,[removed],elonmusk


In [6]:
df['subreddit'].value_counts()

Futurology    5700
elonmusk      5561
Name: subreddit, dtype: int64

In [7]:
# Drop NAs.
df.dropna(inplace=True)

### Label target value

In [8]:
# Target is 'subreddit'. Current values: 'elon', 'futurolgy'.
# Need to transform to 0 or 1 values. Set elonmusk = 1, futurology = 0

df['target'] = df['subreddit'].map({'elonmusk': 1, 'Futurology': 0})
df.drop('subreddit', axis=1, inplace=True)
df.head()


Unnamed: 0,body,target
0,They shared.\nThey did not share.\nThey resist...,1
1,"Sorry to break it to you, but corporate monopo...",1
2,And wearing his Elon's Musk,1
3,This is where it starts getting a little weird,1
4,[removed],1


In [9]:
df['target'].value_counts()

0    5700
1    5561
Name: target, dtype: int64

In [10]:
#duplicates??

df.duplicated().sum()

7824

In [11]:
df[df.duplicated()].head()

Unnamed: 0,body,target
9,[removed],1
36,[removed],1
38,[removed],1
39,[removed],1
54,[removed],1


In [12]:
df[df.duplicated()].tail()

Unnamed: 0,body,target
11254,And you are going to find out soon that it was...,0
11255,No. Lithium sucks. Lithium makes you emotional...,0
11256,"It cannot write books, loses coherence after a...",0
11257,How so? 3000 feet is perfectly survivable and...,0
11259,[removed],0


In [13]:
dup_list = list(df[df.duplicated()]['body'])[0]
dup_list

'[removed]'

In [14]:
# number of duplicate 
len(df[df['body'] == dup_list])

1166

In [15]:
dup_list2 = list(df[df.duplicated()]['body'])[-1]
dup_list2

'[removed]'

In [16]:
# number of duplicate dup_list2 posts
len(df[df['body'] == dup_list2])

1166

In [17]:
# look at other duplicate posts
df[df.duplicated() & (df['body'] != dup_list) & (df['body'] != dup_list2)].head(10)

Unnamed: 0,body,target
77,[deleted],1
221,[deleted],1
364,[deleted],1
447,[deleted],1
560,LMAO,1
611,[deleted],1
630,[deleted],1
638,Nice.,1
639,Nice.,1
640,Nice.,1


In [18]:
# look at other duplicate posts
df[df.duplicated() & (df['body'] != dup_list) & (df['body'] != dup_list2)].tail(10)

Unnamed: 0,body,target
11247,While I also do as much as I personally can (i...,0
11248,Would fuck up the flight. Imagine a floppy com...,0
11249,Having the government subsidize renewable ener...,0
11250,&gt; . . and are among the biggest donators to...,0
11251,Underrated comment lol,0
11253,You guys have got to get your typo shit together,0
11254,And you are going to find out soon that it was...,0
11255,No. Lithium sucks. Lithium makes you emotional...,0
11256,"It cannot write books, loses coherence after a...",0
11257,How so? 3000 feet is perfectly survivable and...,0


In [19]:
df.shape

(11261, 2)

In [20]:
df.drop_duplicates(inplace=True)

In [21]:
df.shape

(3437, 2)

In [22]:
df.tail()

Unnamed: 0,body,target
11158,"To be really fair, a Tesla on autopilot crashi...",0
11159,No they aren't. This is the problem with lack ...,0
11160,"Beats dying on earth, which is what awaits mos...",0
11258,No but it starts at common sense. If it is not...,0
11260,Comic book?\n\nWe basically exist in a convent...,0


In [23]:
df = df.reset_index(drop=True)
df.tail()

Unnamed: 0,body,target
3432,"To be really fair, a Tesla on autopilot crashi...",0
3433,No they aren't. This is the problem with lack ...,0
3434,"Beats dying on earth, which is what awaits mos...",0
3435,No but it starts at common sense. If it is not...,0
3436,Comic book?\n\nWe basically exist in a convent...,0


In [24]:
#help from https://towardsdatascience.com/the-real-world-as-seen-on-twitter-sentiment-analysis-part-one-5ac2d06b63fb
#https://stackoverflow.com/questions/4328500/how-can-i-strip-all-punctuation-from-a-string-in-javascript-using-regex


def cleaner(text):
    # Make lowercase
    text = text.lower()

    # Remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\&\w*;', '', text)
    
    # Remove hyperlinks
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)
    
    # Remove punctuation and split 's, 't, 've with a space for filter
    text = re.sub(r'[' + string.punctuation.replace('@', '') + ']+', ' ', text)
    
    # Remove words with 2 or fewer letters
    text = re.sub(r'\b\w{1,2}\b', '', text)
    
    # Remove whitespace (including new line characters)
    text = re.sub(r'\s\s+', ' ', text)
    
    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
    text = ''.join(c for c in text if c <= '\uFFFF') 
    
    return text

In [25]:
df['body'] = df['body'].apply(cleaner)

In [26]:
df.shape

(3437, 2)

In [27]:
 # drop rows where body = ''
df = df[df['body'] != '']
df = df.reset_index(drop=True)

In [28]:
df.shape

(3427, 2)

### NLP preprocessing 

**LETS Lemmatize**

In [29]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(text):
    words = text.split()
    lemma_words = ''
    for word in words:
        lemma_words += (lemmatizer.lemmatize(word) + ' ')
    return lemma_words

In [30]:
df['body'] = df['body'].apply(lemmatize_words)

In [31]:
df.shape

(3427, 2)

In [32]:
# drop rows where body = ''
df = df[df['body'] != '']
df = df.reset_index(drop=True)

In [33]:
df.shape

(3413, 2)

#### Stopwords

In [34]:
#import spacy 
#from spacy.lang.en.stop_words import STOP_WORDS

#print(len(STOP_WORDS))
#print(STOP_WORDS)

In [35]:
#STOP_WORDS |= {"wa","ha"}

#print(len(STOP_WORDS))
#print(STOP_WORDS)

In [36]:
# add non-meaningful words from the "most frequent" lists above to the stop words dictionary
from sklearn.feature_extraction import text
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

text.ENGLISH_STOP_WORDS

add_stop_words = ['did', 'got', 'don', 'ha', 'just', 'going', 'got', 'wa', 'really', 'the']

stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

#stop_words

### CountVectorizer

In [37]:
X = df['body']
y = df['target']

y.value_counts(normalize=True)

0    0.523
1    0.477
Name: target, dtype: float64

In [38]:
# Split the data into the training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y,
                                                    random_state=42)

In [39]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2286,), (1127,), (2286,), (1127,))

#### Working with stop words and most frequent words

In [40]:
# Instantiate a CountVectorizer with the default hyperparameters.
cvec = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = 'english',
                             max_features = 35,
                             max_df=.98) 

In [41]:
# Fit the vectorizer on our corpus.
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.98, max_features=35, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [42]:
# Transform the corpus.
X_train = cvec.transform(X_train)

In [43]:
#Convert X_train into a DataFrame.

X_train_df = pd.DataFrame(X_train.toarray(),
                          columns=cvec.get_feature_names())
print(X_train_df.shape)
X_train_df.head()

(2286, 35)


Unnamed: 0,actually,car,change,company,don,elon,energy,going,good,ha,...,tesla,thing,think,time,wa,want,way,work,world,year
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,3,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
# Transform test
X_test = cvec.transform(X_test)
X_test_df = pd.DataFrame(X_test.toarray(),
                         columns=cvec.get_feature_names())

print(X_test_df.shape)
X_test_df.head()

(1127, 35)


Unnamed: 0,actually,car,change,company,don,elon,energy,going,good,ha,...,tesla,thing,think,time,wa,want,way,work,world,year
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
plt.figure(figsize = (10, 5))

# Visualize top 10 words
plt.bar(most_freq[0][:10], most_freq[1][:10]);

NameError: name 'most_freq' is not defined

<Figure size 720x360 with 0 Axes>

#### Most Frequent Futurology words:

In [None]:
STOP_WORDS = nltk.corpus.stopwords.words('english')
#STOP_WORDS.append('wa')

newStopWords = ["wa","ha"]
STOP_WORDS.extend(newStopWords)

print(len(STOP_WORDS))
print(STOP_WORDS)

In [None]:
#starting with futureology

# Using change max_features

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = 'english',
                             max_features = 35,
                             max_df=.98) 

# input for CountVectorizer is an array of strings
vector_input_future = df[df['target'] == 0]['body']

# fit_transform the vectorizer
future_words = count_vect.fit_transform(vector_input_future)

# convert output to a Numpy array
future_words_df = pd.DataFrame(future_words.toarray(),
                         columns=count_vect.get_feature_names())

In [None]:
# empty dictionary
top_words = {}

# loop through columns
for i in future_words_df.columns:
    # save sum of each column in dictionary
    top_words[i] =  future_words_df[i].sum()
    
# top_words to dataframe sorted by highest occurance
most_freq_future = pd.DataFrame(sorted(top_words.items(), key = lambda x: x[1], reverse = True))

In [None]:
most_freq_future.head()

In [None]:
plt.figure(figsize = (15, 5))

# Visualize top 10 words
plt.bar(most_freq_future[0][:20], most_freq_future[1][:20]);

In [None]:
# get the words
future_word_list = count_vect.get_feature_names()
print(future_word_list)

#### Most Frequent Elon words:

In [None]:
df['target'].value_counts()

In [None]:
#Elon most frequent words 

# Using change max_features

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = 'english',
                             max_features = 35,
                             max_df=.98) 

# input for CountVectorizer is an array of strings
vector_input_elon = df[df['target'] == 1]['body']

# fit_transform the vectorizer
elon_words = count_vect.fit_transform(vector_input_elon)

# convert output to a Numpy array
elon_words_df = pd.DataFrame(future_words2.toarray(),
                         columns=count_vect.get_feature_names())

In [None]:
# empty dictionary
top_words = {}

# loop through columns
for i in elon_words_df.columns:
    # save sum of each column in dictionary
    top_words[i] =  elon_words_df[i].sum()
    
# top_words to dataframe sorted by highest occurance
most_freq2 = pd.DataFrame(sorted(top_words.items(), key = lambda x: x[1], reverse = True))

In [None]:
# get the words
elon_list_df = count_vect.get_feature_names()
print(elon_list)

In [None]:
plt.figure(figsize = (15, 5))

# Visualize top 10 words
plt.bar(most_freq2[0][:20], most_freq2[1][:20]);

#### Word/N-gram frequency

In [None]:
# Instantiate the "CountVectorizer" object, which is scikit-learn's bag of words tool.
# CountVectorizer transforms the body text from the reddit comments into features (i.e. words)
# and creates columns (vectors) with word counts for each comment

count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = stop_words, 
                             max_features = 10000, 
                             ngram_range=(1, 3)
                            ) 

# input for CountVectorizer is an array of strings
vector_input_future = df[df['target'] == 0]['body']

future_words= count_vect.fit_transform(vector_input_future)

# convert output to a Numpy array
future_words = future_words.toarray() 

In [None]:
future_matrix = pd.DataFrame(future_words, columns=count_vect.get_feature_names())

future_matrix.sum().sort_values(ascending=False).head(50)

In [None]:
future_matrix.mean().sort_values(ascending=False).head(50)

In [None]:
count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = stop_words, 
                             max_features = 10000, 
                             ngram_range=(1, 3)
                            ) 

# input for CountVectorizer is an array of strings
vector_input_elon = df[df['target'] == 1]['body']

# fit_transform the vectorizer
elon_words = count_vect.fit_transform(vector_input_elon)

# convert output to a Numpy array
elon_words = elon_words.toarray()

In [None]:
elon_matrix = pd.DataFrame(elon_words, columns=count_vect.get_feature_names())

elon_matrix.sum().sort_values(ascending=False).head(50)

In [None]:
elon_matrix.mean().sort_values(ascending=False).head(50)

#### TF-IDF Vectorizer

In [None]:
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

future_tf_words = tvec.fit_transform(vector_input_future)

future_tf_words = future_tf_words.toarray()

future_matrix = pd.DataFrame(future_tf_words, columns=tvec.get_feature_names())

future_matrix.sum().sort_values(ascending=False).head(50)

In [None]:
tvec = TfidfVectorizer(analyzer = "word", 
                     stop_words = stop_words, 
                     max_features = 10000, 
                     ngram_range = (1, 3))

elon_tf_words = tvec.fit_transform(vector_input_elon)

elon_tf_words = elon_tf_words.toarray()

elon_matrix = pd.DataFrame(elon_tf_words, columns=tvec.get_feature_names())

elon_matrix.sum().sort_values(ascending=False).head(50)

In [None]:
df.to_csv('comments_clean.csv', index=False)