# Training a machine learning model
<br />
<br />
  In this Notebook i will train a machine learning algorithm to perform a sentiment analysis. I use a dataset i found on Kaggle.com with Tweets that have already been annotated with the apporiate sentiment. The first step is a little exploratory data analysis to see how the data is strucutred. Then i will apply various preprocessing steps to clean the data. Finally i will train some ML algorithms and try to determine under which condidtions they perform best.

In [1]:
import pandas as pd
import numpy as np
import re
import pycld2 as cld2
import nltk
import pickle

import matplotlib.pyplot as plt
from collections import Counter

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC

from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#nltk.download('stopwords')

In [2]:
training_data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin-1')

# Exploratory Data Analysis

In [7]:
training_data.head()

Unnamed: 0,target,id,tweet
0,0,1467810672,is upset that he can't update his Facebook by ...
1,0,1467810917,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,my whole body feels itchy and like its on fire
3,0,1467811193,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,@Kwesidei not the whole crew


In [4]:
training_data.columns = ['target', "id", "date", "flag", "username", "tweet"]

In [5]:
training_data = training_data.drop(columns=['date', 'flag','username'])

In [6]:
training_data.value_counts(['target'])

target
4         800000
0         799999
dtype: int64

# Data Preprocessing
<br />
<br />
First i will remove contractions and replace them to provide better results and more consistant usage of the language and and reduce the noise a little bit. Also popular abbreviations like 'u' for you and '4' for 'for' are replaced too.  
Then i clean the tweets: i remove @mentions completly because they add no valuable data. Hashtags (the symbol itself) are removed but not the actual hashtags itself. Hyperlinks and special characters are also removed as well as more than 2 characters.  
In the next step i remove all the stop words.
Finally the verbs are lemmatized.

In [8]:
# replace common contractions and abbreviations
def repl_contract(text):
    
    text = text.lower()
    cont = {
        "aren't" : 'are not',
        "can't" : 'cannot',
        "couldn't" : 'could not',
        "don't" : "do not",
        "didn't" : 'did not',
        "doesn't" : 'does not',
        "hadn't" : 'had not',
        "haven't" : 'have not',
        "hasn't" : "has not",
        "how's" : "how is",
        "he's" : 'he is',
        "she's" : 'she is',
        "he'll" : "he will",        
        "she'll" : 'she will',
        "he'd" : "he would",
        "she'd" : "she would",
        "here's" : "here is", 
        "i'm" : 'i am',
        "i've" : "i have",
        "i'll" : "i will",
        "i'd" : "i would",
        "isn't": "is not", 
        "it's" : "it is",
        "it'll" : "it will",
        "mustn't" : "must not",
        "shouldn't" : "should not",
        "that's" : "that is", 
        "there's" : "there is",
        "they're" : "they are",
        "they've" : "they have",
        "they'll" : "they will",
        "they'd" : "they would",
        "wasn't" : "was not",
        "we're" : "we are",
        "we've" : "we have",
        "we'll" : "we will", 
        "we'd" : "we would",
        "weren't" : "were not",
        "what's" : "what is",
        "when's" : "when is",
        "why's" : "why is",
        "where's" : "where is",
        "who's" : "who is",
        "who'll" : "who will",
        "won't" : "will not",
        "wouldn't" : "would not",
        "you're" : "you are",
        "you've" : "you have",
        "you'll" : "you will",
        "you'd" : "you would",
        "mayn't" : "may not",
        "4" : "for",
        "2" : "to",
        "1" : "one",
        "u" : "you",
        "r" : "are",
        "amp" : "",
        "re" : "",
        "gimme" : "give me",
        "gonna" : "going to",
        "cause" : "because",
        "imma" : "i am going to",
        "wanna" : "want to",
        "gotta" : "got to",
        "woulda" : "would have",
        "coulda" : "could have",
        "shoulda" : "should have",
        "let's" : "let us",
        "y'all" : "you all",
            }
    
    cleaned_text = []
    words = text.split()
    
    for word in words:
        if word in cont:
            cleaned_text.append(cont[word])
        else:
            cleaned_text.append(word)
    text = ' '.join(cleaned_text)
        
    return text

In [9]:
# clean the tweets
def clean(text):
    text = text.lower() # lower case
    text = re.sub(r"@[A-Za-z0-9]+","", text) # remove @mentions
    text = re.sub(r"#","", text) # remove #
    text = re.sub(r"\ART[\s]+","", text) # remove RT
    text = re.sub(r"https?:\/\/\S+","", text) # remove hyperlink
    text = re.sub("(.)\\1{2,}","\\1", text) # remove more than two characters.
    text = re.sub(r"[^A-Za-z0-9\s]+", " ",str(text)) #remove special characters
    
    return text

In [10]:
stop_words = set(stopwords.words('english'))

# remove stopwords
def remv_stopw(text):

    cleaned_text = []
    words = word_tokenize(text) 

    for word in words:
        if not word in stop_words:
                cleaned_text.append(word)
            
    return ' '.join(cleaned_text)

In [11]:
lemmatizer = WordNetLemmatizer()

# lemmatize verbs 
def lemma(text, stem=False, lemmatize=False):

    cleaned_text = []
    words = word_tokenize(text) 

    for word in words:
        cleaned_text.append(lemmatizer.lemmatize(word,pos="v"))
            
    return ' '.join(cleaned_text)

In [12]:
# detect language
def detect_lang(tweet):
    try:
        isReliable, textBytesFound, details = cld2.detect(tweet)
        return details[0][0]
    except:
        return "not found"

I save various steps of my cleaned data to test if the algorithms benefit from removing stop words and lemmatizing.

 + `clean_tweet`= Replaced contractions and common abbreviations as well as removed hyperlinks, special characters and etc.
 + `clean_tweet_nostop`= Removed stop words also.  
 - `clean_tweet_nostop_lemma`= Lemmatized the verbs.

In [13]:
training_data['clean_tweet'] = training_data['tweet'].apply(repl_contract)
training_data['clean_tweet'] = training_data['clean_tweet'].apply(clean)
training_data['clean_tweet_nostop'] = training_data['clean_tweet'].apply(remv_stopw)
training_data['clean_tweet_nostop_lemma'] = training_data['clean_tweet_nostop'].apply(lemma)

In [14]:
training_data.head()

Unnamed: 0,target,id,tweet,clean_tweet,clean_tweet_nostop,clean_tweet_nostop_lemma
0,0,1467810672,is upset that he can't update his Facebook by ...,is upset that he cannot update his facebook by...,upset update facebook texting might cry result...,upset update facebook texting might cry result...
1,0,1467810917,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to s...,dived many times ball managed save 50 rest go ...,dive many time ball manage save 50 rest go bound
2,0,1467811184,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,whole body feels itchy like fire,whole body feel itchy like fire
3,0,1467811193,"@nationwideclass no, it's not behaving at all....",no it is not behaving at all i am mad why ...,behaving mad see,behave mad see
4,0,1467811372,@Kwesidei not the whole crew,not the whole crew,whole crew,whole crew


In [15]:
training_data['language'] = training_data['clean_tweet'].apply(detect_lang)

In [None]:
Counter(" ".join(training_data['language']).split()).most_common(20)

In [16]:
training_data['language'].replace({'SCOTS': 'ENGLISH'}, inplace=True)

In [17]:
training_data = training_data[training_data["language"] == "ENGLISH"]
training_data.value_counts(['target'])

target
0         768278
4         757892
dtype: int64

Right now my data is skewed a little. Thats why i take a even number of positive and negative tweets to get the best results for my algorithms.

In [18]:
neg = training_data.loc[training_data['target'] == 0]
pos = training_data.loc[training_data['target'] == 4]

pos = pos[0:750000]
neg = neg[0:750000]

training_data = pos.append(neg,ignore_index = True)
training_data.value_counts(['target'])

target
4         750000
0         750000
dtype: int64

In [19]:
training_data.head()

Unnamed: 0,target,id,tweet,clean_tweet,clean_tweet_nostop,clean_tweet_nostop_lemma,language
0,4,1467822272,I LOVE @Health4UandPets u guys r the best!!,i love you guys are the best,love guys best,love guy best,ENGLISH
1,4,1467822273,im meeting up with one of my besties tonight! ...,im meeting up with one of my besties tonight ...,im meeting one besties tonight cant wait girl ...,im meet one besties tonight cant wait girl talk,ENGLISH
2,4,1467822283,"@DaRealSunisaKim Thanks for the Twitter add, S...",thanks for the twitter add sunisa i got to ...,thanks twitter add sunisa got meet hin show dc...,thank twitter add sunisa get meet hin show dc ...,ENGLISH
3,4,1467822287,Being sick can be really cheap when it hurts t...,being sick can be really cheap when it hurts t...,sick really cheap hurts much eat real food plu...,sick really cheap hurt much eat real food plus...,ENGLISH
4,4,1467822293,@LovesBrooklyn2 he has that effect on everyone,he has that effect on everyone,effect everyone,effect everyone,ENGLISH


# Machine Learning
<br />
<br />
In this part i train various machine learning algorithms and evalute their accuracy.  
The algortihms i used are:

 + Linear Regressio
 - Passive Aggressive
 - Ridge Regression
 - Linear Support Vector Machines
 - ADA Boost

In [24]:
def train_model_LR(train_data, targets):
    X_train, X_test, Y_train, Y_test = train_test_split(train_data, targets, test_size=0.1)

    model = Pipeline([('vect', HashingVectorizer(ngram_range=(1, 2))),
                      ('clf', LogisticRegression(max_iter=1000, solver="saga")),
              ])
    model.fit(X_train, Y_train)

    return model, X_test, Y_test

In [None]:
def train_model_PA(train_data, targets):
    X_train, X_test, Y_train, Y_test = train_test_split(train_data, targets, test_size=0.2)

    model = Pipeline([('vect', HashingVectorizer(ngram_range=(2, 2))),
                      ('clf', PassiveAggressiveClassifier()),
              ])
    model.fit(X_train, Y_train)

    return model, X_test, Y_test

In [None]:
def train_model_RR(train_data, targets):
    X_train, X_test, Y_train, Y_test = train_test_split(train_data, targets, test_size=0.2)

    model = Pipeline([('vect', HashingVectorizer(ngram_range=(2, 2))),
                      ('clf', RidgeClassifier()),
              ])
    model.fit(X_train, Y_train)

    return model, X_test, Y_test

In [None]:
def train_model_SVC(train_data, targets):
    X_train, X_test, Y_train, Y_test = train_test_split(train_data, targets, test_size=0.2)

    model = Pipeline([('vect', HashingVectorizer(ngram_range=(2, 2))),
                      ('clf', LinearSVC(random_state=0, tol=1e-5)),
              ])
    model.fit(X_train, Y_train)

    return model, X_test, Y_test

In [None]:
def train_model_ADA(train_data, targets):
    X_train, X_test, Y_train, Y_test = train_test_split(train_data, targets, test_size=0.2)

    model = Pipeline([('vect', HashingVectorizer(ngram_range=(1, 2))),
                      ('clf', AdaBoostClassifier(n_estimators=100, random_state=0)),
              ])
    model.fit(X_train, Y_train)

    return model, X_test, Y_test

In [22]:
def check_model_metrics(model, test_data, test_targets):
    y_pred = model.predict(test_data)

    print("ACCURACY:")
    print(metrics.accuracy_score(test_targets, y_pred)*100)

    print("\nCONFUSION MATRIX")
    print(confusion_matrix(test_targets, y_pred))

    print("\nCLASSIFICATION REPORT")
    print(classification_report(test_targets, y_pred))

Three different sets of training data for testing the algorithms.
-clean_tweet = removed contractions and punctuation etc..
-clean_tweet_nostop = removed stopwords
-clean_tweet_nostop_lemma = leammtized and no stop words

In [21]:
train_data_1 = training_data['clean_tweet']
train_data_2 = training_data['clean_tweet_nostop']
train_data_3 = training_data['clean_tweet_nostop_lemma']
targets = training_data["target"]

### Linear Regresssion

In [25]:
model_LR, x_test, y_test = train_model_LR(train_data_1, targets)
check_model_metrics(model_LR, x_test, y_test)

ACCURACY:
82.074

CONFUSION MATRIX
[[60938 14194]
 [12695 62173]]

CLASSIFICATION REPORT
              precision    recall  f1-score   support

           0       0.83      0.81      0.82     75132
           4       0.81      0.83      0.82     74868

    accuracy                           0.82    150000
   macro avg       0.82      0.82      0.82    150000
weighted avg       0.82      0.82      0.82    150000



In [None]:
model_LR, x_test, y_test = train_model_LR(train_data_2, targets)
check_model_metrics(model_LR, x_test, y_test)

In [None]:
model_LR, x_test, y_test = train_model_LR(train_data_3, targets)
check_model_metrics(model_LR, x_test, y_test)

### Passive Agressive

In [None]:
model_PA, x_test, y_test = train_model_PA(train_data_1, targets)
check_model_metrics(model_PA, x_test, y_test)

In [None]:
model_PA, x_test, y_test = train_model_PA(train_data_2, targets)
check_model_metrics(model_PA, x_test, y_test)

In [None]:
model_PA, x_test, y_test = train_model_PA(train_data_3, targets)
check_model_metrics(model_PA, x_test, y_test)

### Ridge Regression

In [None]:
model_RR, x_test, y_test = train_model_RR(train_data_1, targets)
check_model_metrics(model_RR, x_test, y_test)

In [None]:
model_RR, x_test, y_test = train_model_RR(train_data_2, targets)
check_model_metrics(model_RR, x_test, y_test)

In [None]:
model_RR, x_test, y_test = train_model_RR(train_data_3, targets)
check_model_metrics(model_RR, x_test, y_test)

### Support Vector Machines

In [None]:
model_SVC, x_test, y_test = train_model_SVC(train_data_1, targets)
check_model_metrics(model_SVC, x_test, y_test)

In [None]:
model_SVC, x_test, y_test = train_model_SVC(train_data_2, targets)
check_model_metrics(model_SVC, x_test, y_test)

In [None]:
model_SVC, x_test, y_test = train_model_SVC(train_data_3, targets)
check_model_metrics(model_SVC, x_test, y_test)

### ADA Boost

In [None]:
model_ADA, x_test, y_test = train_model_ADA(train_data_1, targets)
check_model_metrics(model_ADA, x_test, y_test)

In [None]:
model_ADA, x_test, y_test = train_model_ADA(train_data_2, targets)
check_model_metrics(model_ADA, x_test, y_test)

In [None]:
model_ADA, x_test, y_test = train_model_ADA(train_data_3, targets)
check_model_metrics(model_ADA, x_test, y_test)

It seems like the algorithms perform best on the data set that didn't have the stop words removed as well as no lemmatization. So i will use the same apporoach on the election tweets.


I save the three best performing algorithms in a pickle file for later use. The best one (=model_LR) is used for the sentiment analysis of the election tweets in my other notebook.

In [None]:
file = open('model_LR.pickle','wb')
pickle.dump(model_LR, file)
file.close()

file = open('model_SVC.pickle','wb')
pickle.dump(model_SVC, file)
file.close()

file = open('model_RC.pickle','wb')
pickle.dump(model_RC, file)
file.close()