# Sentiment Analysis on Reddit Data

In [1]:
import pandas as pd
import numpy as np
import re

import nltk
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
data=pd.read_csv(r"C:\Users\Downloads\Reddit_Data.csv")
data.head()

Unnamed: 0,comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [3]:
#Preprocessing

In [4]:
#Removing null values
data=data.dropna()
data = data.reset_index(drop=True)
data

Unnamed: 0,comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1
...,...,...
37144,jesus,0
37145,kya bhai pure saal chutiya banaya modi aur jab...,1
37146,downvote karna tha par upvote hogaya,0
37147,haha nice,1


In [5]:
#Removing neutral values
data=data[data['category']!=0]
data['label']=np.where(data['category']==1,1,0)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label']=np.where(data['category']==1,1,0)


Unnamed: 0,comment,category,label
0,family mormon have never tried explain them t...,1,1
1,buddhism has very much lot compatible with chr...,1,1
2,seriously don say thing first all they won get...,-1,0
4,for your own benefit you may want read living ...,1,1
5,you should all sit down together and watch the...,-1,0


In [6]:
#Lower casing
data['pre_process'] = data['comment'].apply(lambda x: ' '.join(x.lower() for x in str(x).split()))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_process'] = data['comment'].apply(lambda x: ' '.join(x.lower() for x in str(x).split()))


Unnamed: 0,comment,category,label,pre_process
0,family mormon have never tried explain them t...,1,1,family mormon have never tried explain them th...
1,buddhism has very much lot compatible with chr...,1,1,buddhism has very much lot compatible with chr...
2,seriously don say thing first all they won get...,-1,0,seriously don say thing first all they won get...
4,for your own benefit you may want read living ...,1,1,for your own benefit you may want read living ...
5,you should all sit down together and watch the...,-1,0,you should all sit down together and watch the...


In [7]:
#Remove the HTML tags and URLs from the reviews.
data['pre_process']=data['pre_process'].apply(lambda x: BeautifulSoup(x).get_text())
data['pre_process']=data['pre_process'].apply(lambda x: re.sub(r'http\S+', '', x))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_process']=data['pre_process'].apply(lambda x: BeautifulSoup(x).get_text())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_process']=data['pre_process'].apply(lambda x: re.sub(r'http\S+', '', x))


Unnamed: 0,comment,category,label,pre_process
0,family mormon have never tried explain them t...,1,1,family mormon have never tried explain them th...
1,buddhism has very much lot compatible with chr...,1,1,buddhism has very much lot compatible with chr...
2,seriously don say thing first all they won get...,-1,0,seriously don say thing first all they won get...
4,for your own benefit you may want read living ...,1,1,for your own benefit you may want read living ...
5,you should all sit down together and watch the...,-1,0,you should all sit down together and watch the...


In [8]:
#Removing word contractions
def contractions(s):
    s = re.sub(r"won't", "will not",s)
    s = re.sub(r"would't", "would not",s)
    s = re.sub(r"could't", "could not",s)
    s = re.sub(r"\'d", " would",s)
    s = re.sub(r"can\'t", "can not",s)
    s = re.sub(r"n\'t", " not", s)
    s= re.sub(r"\'re", " are", s)
    s = re.sub(r"\'s", " is", s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"\'t", " not", s)
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"\'m", " am", s)
    return s
data['pre_process']=data['pre_process'].apply(lambda x:contractions(x))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_process']=data['pre_process'].apply(lambda x:contractions(x))


Unnamed: 0,comment,category,label,pre_process
0,family mormon have never tried explain them t...,1,1,family mormon have never tried explain them th...
1,buddhism has very much lot compatible with chr...,1,1,buddhism has very much lot compatible with chr...
2,seriously don say thing first all they won get...,-1,0,seriously don say thing first all they won get...
4,for your own benefit you may want read living ...,1,1,for your own benefit you may want read living ...
5,you should all sit down together and watch the...,-1,0,you should all sit down together and watch the...


In [9]:
#Removing non-alpha characters
data['pre_process']=data['pre_process'].apply(lambda x: " ".join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_process']=data['pre_process'].apply(lambda x: " ".join([re.sub('[^A-Za-z]+','', x) for x in nltk.word_tokenize(x)]))


Unnamed: 0,comment,category,label,pre_process
0,family mormon have never tried explain them t...,1,1,family mormon have never tried explain them th...
1,buddhism has very much lot compatible with chr...,1,1,buddhism has very much lot compatible with chr...
2,seriously don say thing first all they won get...,-1,0,seriously don say thing first all they won get...
4,for your own benefit you may want read living ...,1,1,for your own benefit you may want read living ...
5,you should all sit down together and watch the...,-1,0,you should all sit down together and watch the...


In [10]:
#Removing extra spaces
data['pre_process']=data['pre_process'].apply(lambda x: re.sub(' +', ' ', x))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_process']=data['pre_process'].apply(lambda x: re.sub(' +', ' ', x))


Unnamed: 0,comment,category,label,pre_process
0,family mormon have never tried explain them t...,1,1,family mormon have never tried explain them th...
1,buddhism has very much lot compatible with chr...,1,1,buddhism has very much lot compatible with chr...
2,seriously don say thing first all they won get...,-1,0,seriously don say thing first all they won get...
4,for your own benefit you may want read living ...,1,1,for your own benefit you may want read living ...
5,you should all sit down together and watch the...,-1,0,you should all sit down together and watch the...


In [11]:
#Removing stopwords
stop = stopwords.words('english')
data['pre_process']=data['pre_process'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_process']=data['pre_process'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))


Unnamed: 0,comment,category,label,pre_process
0,family mormon have never tried explain them t...,1,1,family mormon never tried explain still stare ...
1,buddhism has very much lot compatible with chr...,1,1,buddhism much lot compatible christianity espe...
2,seriously don say thing first all they won get...,-1,0,seriously say thing first get complex explain ...
4,for your own benefit you may want read living ...,1,1,benefit may want read living buddha living chr...
5,you should all sit down together and watch the...,-1,0,sit together watch simpsons episode lisa becom...


In [12]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
data['pre_process']=data['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pre_process']=data['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))


Unnamed: 0,comment,category,label,pre_process
0,family mormon have never tried explain them t...,1,1,family mormon never tried explain still stare ...
1,buddhism has very much lot compatible with chr...,1,1,buddhism much lot compatible christianity espe...
2,seriously don say thing first all they won get...,-1,0,seriously say thing first get complex explain ...
4,for your own benefit you may want read living ...,1,1,benefit may want read living buddha living chr...
5,you should all sit down together and watch the...,-1,0,sit together watch simpson episode lisa become...


In [13]:
#Word Embedding Matrix

In [14]:
#Splitting into train and test datasets
X_train,X_test,Y_train, Y_test = train_test_split(data['pre_process'], data['label'], test_size=0.25, random_state=30)
print("Train: ",X_train.shape,Y_train.shape,"Test: ",(X_test.shape,Y_test.shape))

Train:  (18080,) (18080,) Test:  ((6027,), (6027,))


In [15]:
vectorizer= TfidfVectorizer()
tf_x_train = vectorizer.fit_transform(X_train)
tf_x_test = vectorizer.transform(X_test)

In [16]:
#Model Building

In [17]:
#Using SVM algorithm
svm = LinearSVC(random_state=0)

#Fitting the Training data into model
svm.fit(tf_x_train,Y_train)

#Predicting the test data
y_test_pred_svm=svm.predict(tf_x_test)
y_test_pred_svm

array([0, 0, 1, ..., 1, 1, 0])

In [18]:
#Using Logistic Regression
lr = LogisticRegression(max_iter=1000,solver='saga')

#Fitting the Training data into model
lr.fit(tf_x_train,Y_train)

y_test_pred_lr=lr.predict(tf_x_test)
y_test_pred_lr

array([1, 0, 1, ..., 1, 1, 0])

In [19]:
#Testing model performance

In [20]:
#SVM model accuracy
report=classification_report(Y_test, y_test_pred_svm,output_dict=True)
report

{'0': {'precision': 0.8236228813559322,
  'recall': 0.7483156881616939,
  'f1-score': 0.7841654059505799,
  'support': 2078},
 '1': {'precision': 0.8736409760811791,
  'recall': 0.9156748543935174,
  'f1-score': 0.894164193867458,
  'support': 3949},
 'accuracy': 0.8579724572755931,
 'macro avg': {'precision': 0.8486319287185556,
  'recall': 0.8319952712776056,
  'f1-score': 0.839164799909019,
  'support': 6027},
 'weighted avg': {'precision': 0.856395646590709,
  'recall': 0.8579724572755931,
  'f1-score': 0.8562386121035169,
  'support': 6027}}

In [21]:
#Logistic Regression Accuracy
report=classification_report(Y_test, y_test_pred_lr,output_dict=True)
report

{'0': {'precision': 0.8776928422515636,
  'recall': 0.6077959576515881,
  'f1-score': 0.7182257605914132,
  'support': 2078},
 '1': {'precision': 0.822362685265911,
  'recall': 0.9554317548746518,
  'f1-score': 0.8839170668853228,
  'support': 3949},
 'accuracy': 0.8355732536917206,
 'macro avg': {'precision': 0.8500277637587373,
  'recall': 0.78161385626312,
  'f1-score': 0.801071413738368,
  'support': 6027},
 'weighted avg': {'precision': 0.8414395172247938,
  'recall': 0.8355732536917206,
  'f1-score': 0.8267897175442337,
  'support': 6027}}