# Information Warfare
## Russia’s use of Twitter during the 2016 US Presidential Election
---

### Import libraries

In [1]:
import numpy as np
import pandas as pd

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
get_ipython().config.get('IPKernelApp', {})['parent_appname'] = ""

import spacy
import os
import pickle

from collections import Counter

from plotly import tools
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as py

init_notebook_mode(connected=True)

### Import data

In [2]:
# All Tweets
df = pd.read_pickle('data/raw/tweets.pkl')
df.reset_index(drop = True, inplace = True)

# Only English language Tweets
dfEng = pd.read_pickle('data/raw/tweetsEng.pkl')
dfEng.reset_index(drop = True, inplace = True)

# Only non-English language Tweets
dfOth = pd.read_pickle('data/raw/tweetsOth.pkl')
dfOth.reset_index(drop = True, inplace = True)


### TBD

In [55]:
# All Tweets
print("All Tweets: \n {:,} rows of data, with {} observations for each row \n {:,} unique accounts \n \n Account types: {} \n Account categories: {} \n".format(df.shape[0], df.shape[1],len(set(df.author)),list(set(df.account_type)),list(set(df.account_category))))


All Tweets: 
 2,034,154 rows of data, with 21 observations for each row 
 463 unique accounts 
 
 Account types: ['Hashtager', '?', 'Left', 'Right', 'Arabic', 'Italian', 'news', 'Russian', 'Commercial', 'local'] 
 Account categories: ['Commercial', 'RightTroll', 'HashtagGamer', 'NewsFeed', 'Unknown', 'LeftTroll', 'NonEnglish'] 



In [56]:
# Only English language Tweets
print("English Tweets: \n {:,} rows of data, with {} observations for each row \n {:,} unique accounts \n \n Account types: {} \n \n Account categories: {} \n".format(dfEng.shape[0], dfEng.shape[1],len(set(dfEng.author)),list(set(dfEng.account_type)),list(set(dfEng.account_category))))


English Tweets: 
 2,116,867 rows of data, with 21 observations for each row 
 2,161 unique accounts 
 
 Account types: ['Hashtager', '?', 'French', 'Left', 'Right', 'Arabic', 'Italian', 'news', 'Russian', 'Commercial', 'ZAPOROSHIA', 'Spanish', 'German', 'Ebola ', 'Koch', 'Portuguese', 'local'] 
 
 Account categories: ['Commercial', 'Fearmonger', 'RightTroll', 'HashtagGamer', 'NewsFeed', 'Unknown', 'LeftTroll', 'NonEnglish'] 



In [57]:
# Only non-English language Tweets
print("Non-English Tweets: \n {:,} rows of data, with {} observations for each row \n {:,} unique accounts \n \n Account types: {} \n \n Account categories: {} \n".format(dfOth.shape[0], dfOth.shape[1],len(set(dfOth.author)),list(set(dfOth.account_type)),list(set(dfOth.account_category))))


Non-English Tweets: 
 829,340 rows of data, with 21 observations for each row 
 2,360 unique accounts 
 
 Account types: ['Ukranian', 'Hashtager', '?', 'French', 'Left', 'Right', 'Arabic', 'Italian', 'local', 'news', 'Russian', 'Commercial', 'ZAPOROSHIA', 'Uzbek', 'German', 'Ebola ', 'Koch', 'Portuguese', 'Spanish'] 
 
 Account categories: ['Commercial', 'Fearmonger', 'RightTroll', 'HashtagGamer', 'NewsFeed', 'Unknown', 'LeftTroll', 'NonEnglish'] 



There are two sets of labels that Linville and Warren have created, namely, account type and account name. Account type consists of more granular labels, while account category is more broad. For this exercise, I will focus on the account category set of labels. 

Before we move on, I should point out that the labeling scheme employed by Linville and Warren allows for accounts that primarily tweet in English to be labeled as Non English. For example, an account that primarily tweets about international events, such as the war in Ukraine, would be labeled as Non English, even if the text is in English. The graph below indicates that activity characterized by this behavior is limited. 

In [9]:
df = dfEng

In [10]:
counts_by_type = dict(Counter(df.account_category))

account_type = list(counts_by_type.keys())
values = list(counts_by_type.values())

account_type = [x for _,x in sorted(zip(values ,account_type), reverse = True)]
values = sorted(values, reverse = True)

other_color = ['Unknown', 'Commercial', 'NonEnglish', 'Fearmonger']

color_list = ['rgb(175, 88, 141)' if i in other_color else 'rgb(23,62,90)' for i in account_type]

data = [go.Bar(
            x=account_type,
            y=values,
            marker = dict(color = color_list),
            opacity = .7
    )]

layout = go.Layout(title = dict(text = 'Counts of Tweets by Category', font = dict(size = 30)), 
                   xaxis = dict(
                       tickangle = 45, 
                       tickfont = dict(size = 15),
                       automargin = True),
                  yaxis = dict(title = dict(text = 'Tweets', font = dict(size = 20))))

fig = go.Figure(data = data, layout = layout)

py.iplot(fig, filename='basic-bar')

The previous chart displayed information for overall tweets. However, what we are really interested in is activity per account. That is, it would be nice to know how many right trolls are in the dataset vs. left trolls and so on. Let's go ahead and visualize this information now. 

In [11]:
account_analysis = df.groupby(['author', 'account_category'])['content'].count().reset_index()

account_analysis = Counter(account_analysis.account_category)

account_type = list(account_analysis.keys())
values = list(account_analysis.values())

account_type = [x for _,x in sorted(zip(values ,account_type), reverse = True)]
values = sorted(values, reverse = True)

color_list = ['rgb(175, 88, 141)' if i in other_color else 'rgb(23,62,90)' for i in account_type]

data = [go.Bar(
            x=account_type,
            y=values,
            marker = dict(color = color_list),
            opacity = .7
    )]

layout = go.Layout(title = dict(text = 'Counts of Accounts by Category', font = dict(size = 30)), 
                   xaxis = dict(
                       tickangle = 45, 
                       tickfont = dict(size = 15),
                       automargin = True),
                  yaxis = dict(title = dict(text = 'Accounts', font = dict(size = 20))))

fig = go.Figure(data = data, layout = layout)

py.iplot(fig, filename='basic-bar')

Ok, this is an interesting chart. While Non English tweets account for very little of the overall tweet volume, there is a sizable number of Non English accounts in the dataset. 

For our procedure to work, we need to make sure that we have enough information (tweets) for each account for doc2vec to appropriatly embed our text in a vector space. As such, we need to ensure that we only consider accounts that meet some threshold for a minimum number of tweets, which we we do below. 

In [12]:
# Let's get counts of the number of tweets by each author
counts_by_author = df[['author', 'content']].groupby('author').count()

counts_by_author.reset_index(inplace = True)

print(sum(counts_by_author.content > 400))

author_series = counts_by_author.author[counts_by_author.content > 400]

df = df[df.author.isin(author_series)]

463


It should be noted that the choice of 400 tweets is arbitrary. Notice that by setting this threshold, the amount of Non English accounts in the dataset has decreased considerably. 

In [13]:
account_analysis = df.groupby(['author', 'account_category'])['content'].count().reset_index()
account_analysis = Counter(account_analysis.account_category)

account_type = list(account_analysis.keys())
values = list(account_analysis.values())

account_type = [x for _,x in sorted(zip(values ,account_type), reverse = True)]
values = sorted(values, reverse = True)

color_list = ['rgb(175, 88, 141)' if i == 'NonEnglish' else 'rgb(23,62,90)' for i in account_type]

data = [go.Bar(
            x=account_type,
            y=values,
            marker = dict(color = color_list),
            opacity = .7
    )]

layout = go.Layout(title = dict(text = 'Accounts with more than 400 Tweets', font = dict(size = 30)), 
                   xaxis = dict(
                       tickangle = 45, 
                       tickfont = dict(size = 15),
                       automargin = True),
                  yaxis = dict(title = dict(text = 'Accounts', font = dict(size = 20))))

fig = go.Figure(data = data, layout = layout)

py.iplot(fig, filename='basic-bar')

# Sampling and Pre-Processing

As you may know, natural language processing can be computationally expensive. Since we are dealing with a large number of tweets, it makes sense to take a sample of the data that we want to work with. 

Here, I take a 30% sample of tweets from each account. 

Note: the 30% sample size is arbitrary.

In [14]:
df.content = df.content.astype(str)

# Create the groupby object of author and category. Note: every author is assigned one category
df_sampled = df.groupby(['author', 'account_category'])['content']

# Take a 30% sample from each group
df_sampled = df_sampled.apply(lambda x: x.sample(frac=0.3, replace = False)).reset_index()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



You might be inclined to ask: why take a sample per account rather than just a sample of the overall dataset? When I first began expirimenting with this data, I tried to embed each individual tweet in a vector space. However, this procedure produced nonsensical results. It seems that there is not enough information contained in an individual tweet for us to gleen any useful unsights about the author, or discriminate between authors. 

However, if we were to combine tweets per author such that we have a single document for each author that is representative of everything that a particular author has ever tweeted, we will have more than enough information to give doc2vec a chance to work. 

There is just one problem: spaCy breaks if we try to parse long documents of text. As such, we will need to group our tweets by author after running the data through spaCy. 

Before we get to spaCy, we need to do some minor preprocessing. Here, I remove hyperlinks and and strip redundant whitepace from the text. I also expirimented with removing the RT symbol, but this did not seem to have any meaningful impact on our analysis. 

In [15]:
# SAMPLE AND PRE-CLEANING

# REMOVE RT SYMBOL
#from processing_functions import rt_remover
#df_sampled.content = df_sampled.content.apply(rt_remover)

# REMOVE ANY HYPERLINKS
from processing_functions import link_remover
df_sampled.content = df_sampled.content.apply(link_remover)


# STRIP ANY WHITESPACE ON EITHER SIDE OF THE TEXT
df_sampled.content = df_sampled.content.str.strip()

ModuleNotFoundError: No module named 'processing_functions'

Now that we have done some basic pre-processing, we can begin to parse and clean our text with spaCy.

# Tokenization, cleaning, and lemmatization with spaCy

- Note: this takes approximately 5 - 10 minutes to run.

In [None]:
from spacy.tokens import Token

# This allows us to add custom attributes to tokens, in this case, hashtags and accounts
Token.set_extension('is_hashtag', default = False, force = True)
Token.set_extension('is_account', default = False, force = True)

# These functions tell spaCy what should be considered a hashtage or account
from processing_functions import hashtag_pipe
from processing_functions import is_account_pipe

# We can  disable pipeline objects to save time: disable = ['parser', 'etc']
nlp = spacy.load("en", disable = ['parser', 'ner'])

# Here I add the two custom functions for hashtags and accounts to the pipeline
nlp.add_pipe(hashtag_pipe)
nlp.add_pipe(is_account_pipe)

# And we're off!
parsed_tweets = list(nlp.pipe(df_sampled.content))

# Gensim


Before we train our gensim doc2vec model, we need to do some more cleaning. The clean_doc function is located in the processing_functions.py file in this repository. 

Note: While I tried running the following without lemmatizing or removing stop words, I did not find that this made any meaningful difference in terms of the clusters that do or do not form.

In [None]:
from processing_functions import clean_doc
from processing_functions import clean_doc_no_lemma

lemma = list(map(clean_doc, parsed_tweets))
#no_lemma = list(map(clean_doc_no_lemma, parsed_tweets))

The next step is perhaps the most critical to this analysis. We need to group the parsed and cleaned tweets by author, so that for each author, we have a 30% sample of everything that they have ever tweeted in a single list (as discussed above). We can easily do this by applying our own function to a Pandas groupby object. 

In [None]:
# each row in parsed content is a list of lists
df_sampled['parsed_content'] = lemma

# lets flatten these so that each row only has one list
from processing_functions import group_lists
        
df_grouped = df_sampled.groupby(['author', 
                    'account_category'])['parsed_content'].apply(group_lists).reset_index()

# Convert the parsed content Series to a list for gensim
parsed_content = list(df_grouped.parsed_content)

Now that out data is clean and in the proper format, we can go ahead and create our gensim doc2vec model. It never ceases to amaze me that we can implement such a poweful algorithm in just a few lines of code. 

In [None]:
# create a doc2vec model
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# parsed_content is the list of parsed text
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(parsed_content)]

# Vector size of 300
model = Doc2Vec(documents, vector_size=300, window=5, min_count=3, workers=6)

arr_list = []

for index in range(0, len(model.docvecs)):
    arr_list.append(model.docvecs[index])
    
vec_array = np.stack(arr_list)

# Visualization and clustering

Great! We've cleaned, tokenized, and embeded our textual data in a vector space. Let's try to visualize our data and see where we are at. 

Since we are working with a 300 dimension dataset, we need to employ a dimension reduction technique if we want to visualize some of the patterns in our data. t-SNE is a great algorithm for this purpose, and we will employ this technique in the space below. 

By reducing the number of dimensions with t-SNE, we can visualize our data on a two-dimensional canvas. In the following chart the color of each individual datapoint will correspond to a label manually assigned by Professors Warren and Linville. 

In [None]:
# Instantiate and fit the t-SNE model.
from sklearn.manifold import TSNE
tsne_35 = TSNE(n_components = 2, perplexity = 35, verbose = 0 , n_iter = 1000).fit_transform(vec_array)

color_list = ["#003f5c", "#ffa600", "#f95d6a", "#a05195", "#ff7c43", "#2f4b7c", "#665191", "#d45087"]

data = []
for idx, i in  enumerate(df_grouped.account_category.unique()):
    x = tsne_35[df_grouped.account_category == i, 0]
    y = tsne_35[df_grouped.account_category == i, 1]
    
    data.append(go.Scatter(x = x, y = y, mode = 'markers', name = i, marker = dict(color = color_list[idx])))

layout = go.Layout(title = dict(text = 't-SNE Scatter: Original Labels', font = dict(size = 30)), legend = dict(font = dict(size = 15)))

fig = go.Figure(data = data, layout = layout)

py.iplot(fig)

Amazing! Our accounts seperate into distinct clusters that are more or less in line with the labels assigned by Warren and Linville! This suggests that we will be able to use KMeans or GMM to cluster accounts in an unsupervised way. 

One particularly interesting result is that there appears to be two distinct groups of Right Trolls that Warren and Linville did not discriminate between. By analyzing the behavior of these groups and others, we may be able to gain additional insight into Russian tradecraft and the overall objective of the Russian Twitter campaign.

Let's move on to our clustering algorithms!

### Kmeans

One of the challenges of using KMeans or any clustering algorithms is how to choose an appropriate K. We can use a metric called inertia to help aid us in choosing how many clusters we should specify for our data. 

But what is inertia and how does it work? Inertia is essentially a measure of clustering quality. That is, good clustering has tight clusters, and samples in each cluster are bunched together. Inertia measures how spread out clusters are and uses the distance from each sample to the centroid of its cluster in its calculation. 

A good rule of thumb is to choose the "elbow" in the inertia plot, or the inflection point where inertia begins to decrease at a slower rate.

Special shout-out to Hugo from Datacamp for teaching me about how inertia can be used to choose k, as well as laying the foundation for nearly everything that I know in python.

In [None]:
from sklearn.cluster import KMeans

inertias = []
clusters = list(range(1,11))

for k in clusters:
    
    model = KMeans(n_clusters = k)
    
    model.fit(vec_array)
    
    inertias.append(model.inertia_)

In [None]:
data = [go.Scatter(x = clusters, y = inertias, mode = 'lines', marker = dict(color = "#003f5c")),
        go.Scatter(x = clusters, y = inertias, mode = 'markers', marker = dict(color = "#2f4b7c", size = 9))]

layout = go.Layout(title = dict(text = 'Inertia Plot', font = dict(size = 30)), showlegend = False)

fig = go.Figure(data = data, layout = layout)

py.iplot(fig)

The inertia plot suggests that a k of five may be appropriate for our data. Let's go ahead and instantiate a KMeans model with k equal to five.

In [None]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5)

kmeans_labels = kmeans.fit_predict(vec_array)

In [None]:
color_list = ["#003f5c", "#ffa600", "#f95d6a", "#a05195", "#ff7c43", "#2f4b7c", "#665191", "#d45087"]

data = []
for idx, i in  enumerate(np.unique(kmeans_labels)):
    x = tsne_35[kmeans_labels == i, 0]
    y = tsne_35[kmeans_labels == i, 1]
    
    data.append(go.Scatter(x = x, y = y, mode = 'markers', name = str(i), marker = dict(color = color_list[idx])))

layout = go.Layout(title = dict(text = 't-SNE Scatter: KMeans Labels', font = dict(size = 30)), legend = dict(font = dict(size = 15)))

fig = go.Figure(data = data, layout = layout)

py.iplot(fig)

That looks great! I am actually surprised at how well KMeans seems to be performing on our data. 

KMeans is a discriminative algorithm, and can be thought of as a hard clustering technique. That is, a single element can only be assigned to one cluster. This may be problematic when analyzing text, as we are likely to see overlap between clusters and would naturally expect more ambiguity in our cluster assignments. For example, the ideological bent of a particular twitter acount may be unclear, and it may be difficult to place into one category or another based on the text alone. 

To account for uncertainty in cluster assignments, we could turn to a soft clustering method where we account for uncertainty in class assignment. For example, Gaussian Mixture Models (GMMs) are generative algorithms that provide a probabilistic way of doing soft clustering. 

GMMs can loosly be thought of as an extension of KMeans, except that instead of placeing centroids in random locations in space, we place probability distributions. We then use the Expectation Maximiazation algorithm to discover parameters for each probability distribution for our K sources, and move the distributions around until convergence.

GMMs allow us to calculate the probability that a sample belongs to a cluster. As such, each sample is assigned a probability that it belongs in each cluster. 

Let's go ahead and experiment with mixture models in the space below!

# Gaussian Mixture Models (Work in Progress)

I will be using Gaussian Mixture Models in the space below, although it is possible to use different kinds of probability distirbutions in mixture models. 

In [None]:
from sklearn.mixture import GaussianMixture as GMM

gmm = GMM(n_components=5).fit(vec_array)

gmm_labels = gmm.predict(vec_array)

In [None]:
color_list = ["#003f5c", "#ffa600", "#f95d6a", "#a05195", "#ff7c43", "#2f4b7c", "#665191", "#d45087"]

data = []
for idx, i in  enumerate(np.unique(gmm_labels)):
    x = tsne_35[gmm_labels == i, 0]
    y = tsne_35[gmm_labels == i, 1]
    
    data.append(go.Scatter(x = x, y = y, mode = 'markers', name = str(i), marker = dict(color = color_list[idx])))

layout = go.Layout(title = '2D representation of doc2vec clusters using GMM')

fig = go.Figure(data = data, layout = layout)

py.iplot(fig)

In [None]:
n_components = np.arange(1, 20)
models = [GMM(n, covariance_type='full', random_state=0).fit(vec_array)
          for n in n_components]

data = [go.Scatter(x = n_components, y = [m.bic(vec_array) for m in models], mode = 'lines', name = 'BIC'),
       go.Scatter(x = n_components, y = [m.aic(vec_array) for m in models], mode = 'lines', name = 'AIC')]

layout = go.Layout(xaxis = dict(title = 'n_components'))

py.iplot(data)

print('n_components associated with minimum AIC: ', 
      np.argmin([m.aic(vec_array) for m in models]) + 1)

In [None]:
gmm.predict_proba(vec_array)

Weird. I'm getting some results that I didn't expect, particularly how that class labels looks exactly the same as those from KMeans, and how the AIC/BIC chart looks. 

After thinking it over, I think that we may be falling victim to the curse of dimensionality. Since I am working with a relatively short and wide dataset, we might expect individual datapoints or even clusters of datapoints to be far apart in terms of distance. If this is the case, then the porbabilities that our model generates will be extremly low or extremly high, and this appears to be the case.

At any rate, I will be sticking with KMeans as I move forward. 

# Analysis

Now that we have completed our KMeans cluster assignments, it's time to see how they stack up against  Linville and Warrens labeling scheme.

In [None]:
df_grouped['kmeans_labels'] = kmeans_labels

df_grouped.head()

In [None]:
label_dict = {}

for row in df_grouped.iterrows():
    
    try: 
        label_dict[row[1].account_category].append(row[1].kmeans_labels)
    
    except:
        label_dict[row[1].account_category] = [row[1].kmeans_labels]

I want to compute the conditional probability of a cluster label given the label assigned by Linville and Warren. This is trickier than it seems, and I could not think of a better way to do this. 

In [None]:
choices = [0, 1, 2, 3, 4]

prob_df = pd.DataFrame()

for i in label_dict.keys():
    count = dict(Counter(label_dict[i]))
    length = sum(list(count.values()))

    prob_dict = {}

    for key in choices:
        try:
            prob_dict[key]= count[key]/length
        
        except:
            prob_dict[key]= 0
    
    row = pd.DataFrame(pd.Series(prob_dict)).T
    
    row.index = [i]
    
    prob_df = pd.concat([prob_df, row])

That was a lot of work!  We now have a dataframe of conditional probabilities. That is given a label by Warren and Linville, we have a conditional probability for each KMeans cluster label.  This is more clear if we view the dataframe. 

In [None]:
prob_df

The table above indicates that for the most part, our clusters align pretty well with Warren and Linville's labels. Let's go ahead and visualize this. 

In [None]:
prob_df_t = prob_df.T

colors = ["#003f5c", "#58508d", "#bc5090", "#ff6361", "#ffa600"]
data = []
for idx, label in enumerate(list(prob_df_t.columns)[:6]):  
    data.append(go.Bar(x = list(prob_df_t.index), y = list(prob_df_t[label]), 
                       marker =  dict(color = colors), opacity = 0.9))
    
fig = tools.make_subplots(rows=2, cols=3, subplot_titles=('LeftTroll', 'HashtagGamer','RightTroll', 
                                                          'NewsFeed', 'NonEnglish', 'Commercial'))
fig.append_trace(data[0], 1, 1)
fig.append_trace(data[1], 1, 2)
fig.append_trace(data[2], 1, 3)
fig.append_trace(data[3], 2, 1)
fig.append_trace(data[4], 2, 2)
fig.append_trace(data[5], 2, 3)

fig['layout'].update(height=700, 
                     title= dict(text = 'Conditional Probability of KMeans Assignment Given a Label', 
                                 font = dict(size = 27)),
                     showlegend = False)

for i in range(1, 7):
    fig['layout']['yaxis' + str(i)].update(range = [0, 1], nticks = 10)


py.iplot(fig)

# Modeling

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

First we need to check our class balance

In [None]:
y = df_grouped.account_category.copy()
x = np.copy(vec_array)

print(Counter(y))

Due to the Limited number of samples, I am going to classify NonEnglish, Commercial, and Unknown as Other.

In [None]:
other_list = ['NonEnglish','Commercial', 'Unknown']

y = y.apply(lambda x: 'Other' if x in other_list else x)

In [None]:
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE

x_train, x_test, y_train, y_test = train_test_split(x, y, stratify = y)

sm = SMOTE(random_state = 123, k_neighbors = 6)

x_res, y_res = sm.fit_resample(x_train, y_train)

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

logreg = OneVsRestClassifier(LogisticRegression(solver = 'liblinear'))

logreg.fit(x_res, y_res)

y_pred = logreg.predict(x_test)

In [None]:
print("Accuracy Score: ",accuracy_score(y_test, y_pred), "\n", "\n")
print(classification_report(y_test, y_pred))

### SVM

In [None]:
from sklearn.svm import SVC

svc = OneVsRestClassifier(SVC(gamma = 'auto'))

param_grid = {'estimator__C': [1, 3, 5, 7]}

grid_svc = GridSearchCV(estimator = svc, param_grid = param_grid, 
                        scoring = 'accuracy', n_jobs = -1, verbose = 0,
                       cv = 4)

grid_svc.fit(x_res, y_res)

y_pred_svc = grid_svc.predict(x_test)

print('SVM Accuracy Score: ',accuracy_score(y_test, y_pred_svc), "\n", "\n")

print(classification_report(y_test, y_pred_svc))

### Random Forrest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state = 123, n_estimators = 10)


params_rf = {'max_depth':[2,3,4,5,6],
             'min_samples_leaf': [0.04, 0.06, 0.08],
             'max_features': [0.2, 0.4, 0.6, 0.8],
             'criterion':['gini', 'entropy']}
             
grid_rf = GridSearchCV(estimator = rf, param_grid = params_rf, 
                       scoring = 'accuracy', cv= 10, n_jobs = -1, iid = True)


grid_rf.fit(x_res, y_res)

y_pred_rf = grid_rf.predict(x_test)

print("Accuracy Score: ", accuracy_score(y_test, y_pred_rf), "\n", "\n")
print(classification_report(y_test, y_pred_rf))

### XGboost

In [None]:
import xgboost as xgb

param_grid = {
  'learning_rate': np.arange(0.05, 1.05, .10),
  'n_estimators' : [50],
  'subsample' : np.arange(0.05, 1.05, .05),
  'max_depth': [2,4,6]
  }

xg_cl = xgb.XGBClassifier(objective = 'multi:softmax')

randomized_xg_cl = RandomizedSearchCV(estimator = xg_cl, 
                                      param_distributions = param_grid,
                                      n_iter = 5, 
                                      cv = 5,
                                      scoring = 'accuracy',
                                      n_jobs = -1,
                                      verbose = 0)

randomized_xg_cl.fit(x_res, y_res)

print("Best Accuracy Score Train CV: ", randomized_xg_cl.best_score_)

In [None]:
y_pred = randomized_xg_cl.predict(x_test)

print('XGBoost Test Accuracy: ', accuracy_score(y_test, y_pred), "\n", "\n")

print(classification_report(y_test, y_pred))