# Packages

In [92]:
# system tools
import os
import sys
sys.path.append(os.path.join(".."))

# data munging tools
import pandas as pd
import classifier_utils as clf

# Machine learning stuff
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit
from sklearn import metrics
from sklearn import preprocessing

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

Reading in the data

In [32]:
filename_test = os.path.join("data", "Corona_test.csv")

DATA1 = pd.read_csv(filename_test, index_col=0)

In [33]:
filename_train = os.path.join("data", "Corona_train.csv")

DATA2 = pd.read_csv(filename_train, index_col=0)

In [45]:
# Merge the 2 df's
DATA = pd.concat([DATA1, DATA2])

In [46]:
DATA.shape

(44955, 5)

In [49]:
DATA2.sample(10)

Unnamed: 0_level_0,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
UserName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
32831,77783,"Ontario, Canada",05/04/2020,"I have always, quietly, declined to use self-c...",Extremely Positive
20717,65669,"Los Angeles, CA",23/03/2020,"Russian Demand For Consumer Credit, Mortgages ...",Negative
11045,55997,,19/03/2020,Bets on when albuterol inhaler prices will be ...,Positive
6839,51791,,18/03/2020,#Saudi_Arabia cuts 2020 budget expenditure by ...,Negative
35985,80937,Australia,07/04/2020,PM fights for his health in intensive care as ...,Positive
10111,55063,750' in Elv.NW NJ,19/03/2020,"Hay @GovMurphy and state of NJ , the gas tax i...",Positive
43073,88025,Muskoka aka Hollywood North,12/04/2020,BUY a house just before the #coronavirus #pand...,Neutral
33136,78088,Atlanta area,05/04/2020,who through incredible selfishness is keeping ...,Positive
23083,68035,Manila/Ibiza,24/03/2020,"In my two hours at the grocery, I have not see...",Negative
22247,67199,,23/03/2020,"During these times, people rely on online shop...",Negative


Number in each sentiment category 

In [50]:
DATA["Sentiment"].value_counts()

Positive              12369
Negative              10958
Neutral                8332
Extremely Positive     7223
Extremely Negative     6073
Name: Sentiment, dtype: int64

Create balanced data

In [60]:
def balance(dataframe, n=500):
    """
    Create a balanced sample from imbalanced datasets.
    
    dataframe: 
        Pandas dataframe with a column called 'text' and one called 'label'
    n:         
        Number of samples from each Sentiment, defaults to 500
    """
    
    # Use pandas select a random bunch of examples from each label
    out = (dataframe.groupby('Sentiment', as_index=False)
            .apply(lambda x: x.sample(n=n))
            .reset_index(drop=True))
    
    return out

In [61]:
DATA_balanced = balance(DATA, 6000) # using the count of the smallest Sentiment category

In [62]:
DATA_balanced.shape

(30000, 5)

In [63]:
DATA_balanced["Sentiment"].value_counts()

Neutral               6000
Negative              6000
Extremely Negative    6000
Positive              6000
Extremely Positive    6000
Name: Sentiment, dtype: int64

extract info from DATA so we can use it 

In [64]:
tweet = DATA_balanced["OriginalTweet"]
sentiment = DATA_balanced["Sentiment"]

In [66]:
DATA_balanced.sample(100)

Unnamed: 0,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
21157,69696,Hand Sanitiser Hotel,25/03/2020,People still taking no notice of lockdown mini...,Neutral
4409,84851,Between Breakfast And Bedtime,09/04/2020,Evil that people would do this deliberately. \...,Extremely Negative
29929,65071,"Cape Town, South Africa",22/03/2020,@AbsaSouthAfrica @Absa Standard Bank announced...,Positive
17162,54989,Austin TX United States,19/03/2020,Bought some of that cheap gas outside Houston ...,Negative
16826,79047,,06/04/2020,Â• We have updated some of our shipping prices...,Negative
...,...,...,...,...,...
3934,67154,,23/03/2020,Is Covid 19. The tip of the ice berg for pande...,Extremely Negative
6380,77184,,04/04/2020,Love seeing people wearing gloves at the super...,Extremely Positive
17235,53092,"Montreal, QC",18/03/2020,#Ulta #shutters #stores shortly after cutting ...,Negative
5391,56132,"London, UK",19/03/2020,Farage says it as it is. ChinaÂ’s appalling tr...,Extremely Negative


Train test set

In [112]:
X_train, X_test, y_train, y_test = train_test_split(tweet,           # texts for the model
                                                    sentiment,          # classification labels
                                                    test_size=0.2,   # create an 80/20 split
                                                    random_state=42) # random state for reproducibility

In [113]:
y_test.value_counts()

Extremely Positive    1244
Negative              1216
Positive              1190
Extremely Negative    1179
Neutral               1171
Name: Sentiment, dtype: int64

In [114]:
y_train.value_counts()

Neutral               4829
Extremely Negative    4821
Positive              4810
Negative              4784
Extremely Positive    4756
Name: Sentiment, dtype: int64

__Vectorizing and Feature Extraction__

In [115]:
vectorizer = TfidfVectorizer(ngram_range = (1,2),     # unigrams and bigrams (1 word and 2 word units)
                             lowercase =  True,       # why use lowercase?
                             max_df = 0.95,           # remove very common words
                             min_df = 0.05,           # remove very rare words
                             max_features = 500)      # keep only top 500 features

# This vectorizer is then used to turn all of our documents into a vector of numbers, instead of text.

In [116]:
# First we do it for our training data...
X_train_feats = vectorizer.fit_transform(X_train)
#... then we do it for our test data
X_test_feats = vectorizer.transform(X_test)
# We can also create a list of the feature names. 
feature_names = vectorizer.get_feature_names()

In [117]:
sc = preprocessing.StandardScaler(with_mean=False)
X_scaled = sc.fit_transform(X_train_feats)

__Classifying and predicting__

In [118]:
classifier = LogisticRegression(random_state=42).fit(X_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
