In [1]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/arko/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import re
import numpy as np
import pandas as pd
import string
# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
# sklearn
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
# vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
reddit_data_extracted = pd.read_csv('../data/reddit_comments_large.csv')

In [4]:
reddit_data_extracted

Unnamed: 0.1,Unnamed: 0,team,title,selftext,comment,replies
0,0,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,we are going to win 100 games in a row,Happy cake day
1,1,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,He’s definitely lurking in this sub,Ayyyy. He should feel free to slide into my DM...
2,2,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,![gif](giphy|11zTEl7fbwml68),
3,3,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,Best Italian born on May 19th in history,I had to go have a look... Pirlo???
4,4,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,![gif](giphy|wYThr3gjSU81Q4cuFY),
...,...,...,...,...,...,...
10862,10862,Southampton,Free Talk Friday,Yes it's back! \n\nTalk about anything and ev...,"Hi all,\n\nI bought a ticket to Saints vs Chel...",
10863,10863,Southampton,Free Talk Friday,Yes it's back! \n\nTalk about anything and ev...,"Saints fans, wanted to get an idea for FPL: wh...",
10864,10864,Southampton,Free Talk Friday,Yes it's back! \n\nTalk about anything and ev...,Is that a bad thing though? Hopefully a season...,agreed!
10865,10865,Southampton,Free Talk Friday,Yes it's back! \n\nTalk about anything and ev...,"I don't think so. \n\nLooking back, he didn't ...",


In [5]:
vader_analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(clean_comment):
    if isinstance(clean_comment, str):
        sentiment = vader_analyzer.polarity_scores(clean_comment)
        return sentiment['compound']
    return None  

In [6]:
reddit_data_extracted['clean_comment'] = reddit_data_extracted['comment'].fillna('')  # Replace NaN with an empty string
reddit_data_extracted['clean_replies'] = reddit_data_extracted['replies'].fillna('')  # Replace NaN with an empty string
reddit_data_extracted['vader_comment_sentiment'] = reddit_data_extracted['clean_comment'].apply(vader_sentiment)
reddit_data_extracted['vader_replies_sentiment'] = reddit_data_extracted['clean_replies'].apply(vader_sentiment)

In [7]:
reddit_data_extracted['overall_sentiment'] = (reddit_data_extracted['vader_comment_sentiment'] + reddit_data_extracted['vader_replies_sentiment'])/2

In [8]:
# we have all the labels for our data. We now wish to run our analysis on this to see what we get
reddit_data_extracted

Unnamed: 0.1,Unnamed: 0,team,title,selftext,comment,replies,clean_comment,clean_replies,vader_comment_sentiment,vader_replies_sentiment,overall_sentiment
0,0,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,we are going to win 100 games in a row,Happy cake day,we are going to win 100 games in a row,Happy cake day,0.5859,0.5719,0.57890
1,1,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,He’s definitely lurking in this sub,Ayyyy. He should feel free to slide into my DM...,He’s definitely lurking in this sub,Ayyyy. He should feel free to slide into my DM...,0.2960,0.7351,0.51555
2,2,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,![gif](giphy|11zTEl7fbwml68),,![gif](giphy|11zTEl7fbwml68),,0.0000,0.0000,0.00000
3,3,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,Best Italian born on May 19th in history,I had to go have a look... Pirlo???,Best Italian born on May 19th in history,I had to go have a look... Pirlo???,0.6369,0.0000,0.31845
4,4,Arsenal,Calafiori's Instagram Story. Can't wait to see...,,![gif](giphy|wYThr3gjSU81Q4cuFY),,![gif](giphy|wYThr3gjSU81Q4cuFY),,0.0000,0.0000,0.00000
...,...,...,...,...,...,...,...,...,...,...,...
10862,10862,Southampton,Free Talk Friday,Yes it's back! \n\nTalk about anything and ev...,"Hi all,\n\nI bought a ticket to Saints vs Chel...",,"Hi all,\n\nI bought a ticket to Saints vs Chel...",,-0.3491,0.0000,-0.17455
10863,10863,Southampton,Free Talk Friday,Yes it's back! \n\nTalk about anything and ev...,"Saints fans, wanted to get an idea for FPL: wh...",,"Saints fans, wanted to get an idea for FPL: wh...",,0.0000,0.0000,0.00000
10864,10864,Southampton,Free Talk Friday,Yes it's back! \n\nTalk about anything and ev...,Is that a bad thing though? Hopefully a season...,agreed!,Is that a bad thing though? Hopefully a season...,agreed!,0.3818,0.3382,0.36000
10865,10865,Southampton,Free Talk Friday,Yes it's back! \n\nTalk about anything and ev...,"I don't think so. \n\nLooking back, he didn't ...",,"I don't think so. \n\nLooking back, he didn't ...",,0.8016,0.0000,0.40080


In [35]:
#categorisation:
reddit_data_extracted['category'] = np.where(reddit_data_extracted['overall_sentiment'] >= 0, 1, -1)

In [36]:
clean_comment, clean_replies, sentiment = list(reddit_data_extracted['clean_comment']), list(reddit_data_extracted['clean_replies']), list(reddit_data_extracted['category'])


In [37]:
data = reddit_data_extracted[['clean_comment','clean_replies','category']]

In [60]:
data_pos = data[data['category'] == 1]
# data_neut = data[data['category'] == 0]
data_neg = data[data['category'] == -1]

dataset = pd.concat([data_pos, data_neg])

In [61]:
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

STOPWORDS = set(stopwordlist)
punctuations_list = string.punctuation

In [62]:
#Converting all clean_comment to lowercase for uniformity
dataset['clean_comment']=dataset['clean_comment'].str.lower()
dataset['clean_replies']=dataset['clean_replies'].str.lower()

In [63]:
'''

Now we create a few function to help us with cleaning of data
1. Removing stopwords from the clean_comment
2. Defining and applying a function to remove punctuation
3. Removing any urls from the clean_comment
4. Removing any numbers present in our clean_comment

'''
def cleaning_stopwords(clean_comment):
    return " ".join([word for word in str(clean_comment).split() if word not in STOPWORDS])

def cleaning_punctuations(clean_comment):
    translator = str.maketrans('', '', punctuations_list)
    return clean_comment.translate(translator)

# def cleaning_repeating_char(clean_comment):
#     return re.sub(r'(.)1+', r'1', clean_comment)

def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)

def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)

In [64]:
dataset['clean_comment'] = dataset['clean_comment'].apply(lambda clean_comment: cleaning_stopwords(clean_comment))
dataset['clean_comment']= dataset['clean_comment'].apply(lambda x: cleaning_punctuations(x))
dataset['clean_comment'] = dataset['clean_comment'].apply(lambda x: cleaning_URLs(x))
dataset['clean_comment'] = dataset['clean_comment'].apply(lambda x: cleaning_numbers(x))

In [65]:
dataset['clean_replies'] = dataset['clean_replies'].apply(lambda clean_replies: cleaning_stopwords(clean_replies))
dataset['clean_replies']= dataset['clean_replies'].apply(lambda x: cleaning_punctuations(x))
dataset['clean_replies'] = dataset['clean_replies'].apply(lambda x: cleaning_URLs(x))
dataset['clean_replies'] = dataset['clean_replies'].apply(lambda x: cleaning_numbers(x))

In [66]:
# reset index for consistency 
dataset = dataset.reset_index(drop=True)

In [67]:
lm = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

# created a function for lemmatizing the clean_column
def lemmatizer_on_clean_column(data):
    clean_column = [lm.lemmatize(word) for word in data]
    return clean_column

In [68]:
dataset['clean_comment'] = dataset['clean_comment'].apply(tokenizer.tokenize)
dataset['clean_comment'] = dataset['clean_comment'].apply(lambda x: lemmatizer_on_clean_column(x))

In [69]:
dataset['clean_replies'] = dataset['clean_replies'].apply(tokenizer.tokenize)
dataset['clean_replies'] = dataset['clean_replies'].apply(lambda x: lemmatizer_on_clean_column(x))

In [70]:
#comment:
X_comment = dataset.clean_comment
y = dataset.category

# Converting tokenized clean_comment back to strings
X_comment = X_comment.apply(lambda x: ' '.join(x))

# Separating the 80% data for training data and 20% for testing data with a set random state to ensure same results
X_train_comment, X_test_comment, y_train, y_test = train_test_split(X_comment,y,test_size = 0.2, random_state =26105111)

vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
vectoriser.fit(X_train_comment)
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))

X_train_comment = vectoriser.transform(X_train_comment)
X_test_comment  = vectoriser.transform(X_test_comment)

No. of feature_words:  10000


In [71]:
y_test

7813    1
2343    1
3202    1
6946    1
2859    1
       ..
9336   -1
3662    1
2527    1
1014    1
793     1
Name: category, Length: 2174, dtype: int64

In [72]:
#comment:
X_replies = dataset.clean_replies
y = dataset.category

# Converting tokenized clean_comment back to strings
X_replies = X_replies.apply(lambda x: ' '.join(x))

# Separating the 80% data for training data and 20% for testing data with a set random state to ensure same results
X_train_replies, X_test_replies, y_train, y_test = train_test_split(X_replies,y,test_size = 0.2, random_state =26105111)

vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
vectoriser.fit(X_train_replies)
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))

X_train_replies = vectoriser.transform(X_train_replies)
X_test_replies  = vectoriser.transform(X_test_replies)

No. of feature_words:  10000


In [73]:
def model_Evaluate_comment(model):
    y_pred_comment = model.predict(X_test_comment)

def model_Evaluate_replies(model):
    y_pred_replies = model.predict(X_test_replies)


In [74]:
BNBmodel_comment = BernoulliNB()
BNBmodel_comment.fit(X_train_comment, y_train)
model_Evaluate_comment(BNBmodel_comment)
y_pred_comment1 = BNBmodel_comment.predict(X_test_comment)

In [75]:
y_pred_comment1

array([1, 1, 1, ..., 1, 1, 1])

In [76]:
BNBmodel_replies = BernoulliNB()
BNBmodel_replies.fit(X_train_replies, y_train)
model_Evaluate_replies(BNBmodel_replies)
y_pred_replies1 = BNBmodel_replies.predict(X_test_replies)

In [77]:
y_pred_replies1

array([1, 1, 1, ..., 1, 1, 1])

In [78]:
combined_y_pred = (y_pred_replies1 + y_pred_comment1)/2
combined_y_pred

array([1., 1., 1., ..., 1., 1., 1.])

In [79]:
print(classification_report(y_test, combined_y_pred))
cf_matrix = confusion_matrix(y_test, combined_y_pred)
print(cf_matrix)

              precision    recall  f1-score   support

        -1.0       0.67      0.07      0.12       559
         0.0       0.00      0.00      0.00         0
         1.0       0.79      0.88      0.84      1615

    accuracy                           0.67      2174
   macro avg       0.49      0.32      0.32      2174
weighted avg       0.76      0.67      0.65      2174

[[  38  144  377]
 [   0    0    0]
 [  19  167 1429]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [80]:
LRmodel_comment  = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
LRmodel_comment.fit(X_train_comment, y_train)
model_Evaluate_comment(LRmodel_comment)
y_pred_comment2 = LRmodel_comment.predict(X_test_comment)

LRmodel_replies  = LogisticRegression(C = 2, max_iter = 1000, n_jobs=-1)
LRmodel_replies.fit(X_train_replies, y_train)
model_Evaluate_comment(LRmodel_replies)
y_pred_replies2 = LRmodel_replies.predict(X_test_replies)

combined_y_pred_2 = (y_pred_replies2 + y_pred_comment2)/2
combined_y_pred_2

array([1., 1., 1., ..., 1., 1., 1.])

In [81]:
print(classification_report(y_test, combined_y_pred_2))
cf_matrix = confusion_matrix(y_test, combined_y_pred_2)
print(cf_matrix)

              precision    recall  f1-score   support

        -1.0       1.00      0.02      0.04       559
         0.0       0.00      0.00      0.00         0
         1.0       0.81      0.97      0.88      1615

    accuracy                           0.72      2174
   macro avg       0.60      0.33      0.31      2174
weighted avg       0.86      0.72      0.67      2174

[[  12  181  366]
 [   0    0    0]
 [   0   56 1559]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
