In [24]:
#import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder



import spacy
from spellchecker import SpellChecker
import string
import emoji
from gensim.models import Word2Vec

import xgboost as xgb


%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
Punctuations = string.punctuation
nltk.download('stopwords')
sp_lem = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Amir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
df_train = pd.read_csv('Dataset/semeval-2017-train.csv', sep='\t')
df_test = pd.read_csv("Dataset/semeval-2017-test.csv", sep='\t')

In [4]:
df_test

Unnamed: 0,label,text
0,0,Trump is building a wall on the Mexican border...
1,-1,@lasinferencias & the WALL Trump wants to buil...
2,-1,President Elect? More like President Erect! A ...
3,0,"Ok, I know a lot of you think a wall on the Me..."
4,0,The Great Mexican Wall Deception: Trump's Amer...
...,...,...
11972,-1,Day 6 - Most annoying character : Ashley Graha...
11973,0,How Fidel Castro changed my life by Jewel L. C...
11974,1,Nick Young — Fidel Castro was a “Legend” | TMZ...
11975,0,"Cuba's Fidel Castro, 90, is dead. Survived 638..."


In [5]:
dfTrain = df_train.copy()
dfTest = df_test.copy()

In [6]:
def PreProcessingPipeline(df, columnName):
    #Remove Mentions and links:
    df[columnName] = df[columnName].str.replace(r'@\w+', '', regex=True)
    df[columnName] = df[columnName].str.replace(r'http\S+|www\S+|https\S+', '', regex=True)

    # Remove numbers
    df[columnName] = df[columnName].str.replace(r'\d+', '', regex=True)

    # Remove punctuation
    df[columnName] = df[columnName].str.replace(r'[^\w\s]', '', regex=True)

    # Remove emojis
    df[columnName] = df[columnName].apply(lambda x: emoji.replace_emoji(x, ''))

    #LowerCase
    df[columnName] = df[columnName].str.lower()

    # Remove extra whitespaces
    df[columnName] = df[columnName].str.strip().replace(r'\s+', ' ', regex=True)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    df[columnName] = df[columnName].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

    #Remove 'RT'
    df[columnName] = df[columnName].str.replace(r'\bRT\b', '', regex=True)

    #Lemmatization using spaCy
    df['lemmatized_text'] = df[columnName].apply(lambda x: " ".join([token.lemma_ for token in sp_lem(x)]))

    return df

In [7]:
ProcessedTrainDF = PreProcessingPipeline(dfTrain, 'text')
ProcessedTestDF = PreProcessingPipeline(dfTest,'text')

In [8]:
ProcessedTestDF

Unnamed: 0,label,text,lemmatized_text
0,0,trump building wall mexican border stop herrio...,trump building wall mexican border stop herrio...
1,-1,wall trump wants build researched would take b...,wall trump want build research would take bill...
2,-1,president elect like president erect wall mexi...,president elect like president erect wall mexi...
3,0,ok know lot think wall mexican border insane h...,ok know lot think wall mexican border insane h...
4,0,great mexican wall deception trumps america al...,great mexican wall deception trumps america al...
...,...,...,...
11972,-1,day annoying character ashley graham resident ...,day annoying character ashley graham resident ...
11973,0,fidel castro changed life jewel l crawford,fidel castro change life jewel l crawford
11974,1,nick young fidel castro legend tmz tv,nick young fidel castro legend tmz tv
11975,0,cubas fidel castro dead survived assassination...,cubas fidel castro dead survive assassination ...


In [9]:
ProcessedTrainDF

Unnamed: 0,label,text,lemmatized_text
0,1,one night like vegas make dat nigga famous,one night like vegas make dat nigga famous
1,1,walking chelsea time day rather lovely love lo...,walk chelsea time day rather lovely love londo...
2,0,first play night aaron rodgers intd udfa cb br...,first play night aaron rodger intd udfa cb bra...
3,0,drove bike today miles felt like jim carrey irene,drive bike today mile feel like jim carrey irene
4,-1,looking temp outsidehpw get hotter sun goes fe...,look temp outsidehpw get hot sun go feel like ...
...,...,...,...
49491,1,today rare day democrats get healthier club gr...,today rare day democrats get healthy club grow...
49492,1,rt today rare day democrats get healthier club...,rt today rare day democrats get healthy club g...
49493,-1,democrats quickly implode concede obama either...,democrats quickly implode concede obama either...
49494,-1,democrats spent night morning trying talk stoc...,democrats spend night morning try talk stock m...


In [10]:
ProcessedTrainDF[['label', 'lemmatized_text']]

Unnamed: 0,label,lemmatized_text
0,1,one night like vegas make dat nigga famous
1,1,walk chelsea time day rather lovely love londo...
2,0,first play night aaron rodger intd udfa cb bra...
3,0,drive bike today mile feel like jim carrey irene
4,-1,look temp outsidehpw get hot sun go feel like ...
...,...,...
49491,1,today rare day democrats get healthy club grow...
49492,1,rt today rare day democrats get healthy club g...
49493,-1,democrats quickly implode concede obama either...
49494,-1,democrats spend night morning try talk stock m...


In [11]:
ProcessedTrainDF['tokenized_text'] = ProcessedTrainDF['lemmatized_text'].apply(word_tokenize)

In [12]:
ProcessedTrainDF

Unnamed: 0,label,text,lemmatized_text,tokenized_text
0,1,one night like vegas make dat nigga famous,one night like vegas make dat nigga famous,"[one, night, like, vegas, make, dat, nigga, fa..."
1,1,walking chelsea time day rather lovely love lo...,walk chelsea time day rather lovely love londo...,"[walk, chelsea, time, day, rather, lovely, lov..."
2,0,first play night aaron rodgers intd udfa cb br...,first play night aaron rodger intd udfa cb bra...,"[first, play, night, aaron, rodger, intd, udfa..."
3,0,drove bike today miles felt like jim carrey irene,drive bike today mile feel like jim carrey irene,"[drive, bike, today, mile, feel, like, jim, ca..."
4,-1,looking temp outsidehpw get hotter sun goes fe...,look temp outsidehpw get hot sun go feel like ...,"[look, temp, outsidehpw, get, hot, sun, go, fe..."
...,...,...,...,...
49491,1,today rare day democrats get healthier club gr...,today rare day democrats get healthy club grow...,"[today, rare, day, democrats, get, healthy, cl..."
49492,1,rt today rare day democrats get healthier club...,rt today rare day democrats get healthy club g...,"[rt, today, rare, day, democrats, get, healthy..."
49493,-1,democrats quickly implode concede obama either...,democrats quickly implode concede obama either...,"[democrats, quickly, implode, concede, obama, ..."
49494,-1,democrats spent night morning trying talk stoc...,democrats spend night morning try talk stock m...,"[democrats, spend, night, morning, try, talk, ..."


In [13]:
word2vec_model = Word2Vec(ProcessedTrainDF['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

In [14]:
def vectorize_text(text, model):
    vector = np.mean([model.wv[word] for word in text if word in model.wv.key_to_index], axis=0)
    return np.zeros(model.vector_size) if np.isnan(vector).any() else vector


In [15]:
ProcessedTrainDF['vectorized_text'] = ProcessedTrainDF['tokenized_text'].apply(lambda x: vectorize_text(x, word2vec_model))

In [16]:
ProcessedTrainDF

Unnamed: 0,label,text,lemmatized_text,tokenized_text,vectorized_text
0,1,one night like vegas make dat nigga famous,one night like vegas make dat nigga famous,"[one, night, like, vegas, make, dat, nigga, fa...","[0.23205236, 0.5465427, -0.07214877, 0.2454151..."
1,1,walking chelsea time day rather lovely love lo...,walk chelsea time day rather lovely love londo...,"[walk, chelsea, time, day, rather, lovely, lov...","[0.2685394, 0.47559628, 0.0845582, 0.30179358,..."
2,0,first play night aaron rodgers intd udfa cb br...,first play night aaron rodger intd udfa cb bra...,"[first, play, night, aaron, rodger, intd, udfa...","[-0.02310304, 0.2068378, -0.032626383, 0.15810..."
3,0,drove bike today miles felt like jim carrey irene,drive bike today mile feel like jim carrey irene,"[drive, bike, today, mile, feel, like, jim, ca...","[0.12879832, 0.443553, 0.02636228, 0.2027146, ..."
4,-1,looking temp outsidehpw get hotter sun goes fe...,look temp outsidehpw get hot sun go feel like ...,"[look, temp, outsidehpw, get, hot, sun, go, fe...","[0.36774287, 0.77125376, 0.122153625, 0.028812..."
...,...,...,...,...,...
49491,1,today rare day democrats get healthier club gr...,today rare day democrats get healthy club grow...,"[today, rare, day, democrats, get, healthy, cl...","[0.22731873, 0.3172918, 0.023001822, 0.1333054..."
49492,1,rt today rare day democrats get healthier club...,rt today rare day democrats get healthy club g...,"[rt, today, rare, day, democrats, get, healthy...","[0.20640317, 0.3416896, 0.023338497, 0.1265540..."
49493,-1,democrats quickly implode concede obama either...,democrats quickly implode concede obama either...,"[democrats, quickly, implode, concede, obama, ...","[-0.09366981, 0.2199023, -0.017313225, 0.06900..."
49494,-1,democrats spent night morning trying talk stoc...,democrats spend night morning try talk stock m...,"[democrats, spend, night, morning, try, talk, ...","[0.15084018, 0.41299924, -0.1441107, 0.1739367..."


In [17]:
len(ProcessedTrainDF.iloc[10,4])

100

In [18]:
X = np.array(ProcessedTrainDF['vectorized_text'].tolist())
y = ProcessedTrainDF['label']

In [19]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=1402)


In [21]:
y_encoded

array([2, 2, 1, ..., 0, 0, 0], dtype=int64)

In [22]:
classifierRF = RandomForestClassifier()
classifierRF.fit(X_train, y_train)

predictions = classifierRF.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.43      0.12      0.19      1507
           1       0.55      0.68      0.61      4488
           2       0.58      0.59      0.59      3905

    accuracy                           0.56      9900
   macro avg       0.52      0.46      0.46      9900
weighted avg       0.55      0.56      0.54      9900



In [25]:
clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', max_depth = 9 , n_estimatiors= 500)

In [26]:
clf.fit(X_train, y_train)

In [27]:
train_predictions_clf = clf.predict(X_train)
test_predictions_clf = clf.predict(X_test)

In [28]:
train_accuracy = accuracy_score(y_train, train_predictions_clf)
test_accuracy = accuracy_score(y_test, test_predictions_clf)

In [29]:
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Training Accuracy: 99.94%
Test Accuracy: 55.03%
