# Packages

In [1]:
# system tools
import os
import sys
sys.path.append(os.path.join(".."))

# data munging tools
import pandas as pd
import classifier_utils as clf

# Machine learning stuff
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import ShuffleSplit
from sklearn import metrics
from sklearn import preprocessing

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

Reading in the data

In [4]:
filename_test = os.path.join("data", "Corona_test.csv")

DATA1 = pd.read_csv(filename_test, index_col=0)

In [5]:
filename_train = os.path.join("data", "Corona_train.csv")

DATA2 = pd.read_csv(filename_train, index_col=0)

In [6]:
# Merge the 2 df's
DATA = pd.concat([DATA1, DATA2])

In [7]:
DATA.shape

(44955, 5)

In [8]:
DATA2.sample(10)

Unnamed: 0_level_0,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
UserName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7664,52616,??????,18/03/2020,God spread small flu called #CoronavirusOutbre...,Extremely Negative
41562,86514,"Queensland, Australia",11/04/2020,Food delivery giants are resisting calls to cu...,Negative
7535,52487,Edmonton,18/03/2020,#storeclerks\r\r\n#coronavirus \r\r\n#COVID19 ...,Positive
11253,56205,UK,19/03/2020,@FootyNutty442 @JacJac66 They should be stayin...,Positive
8086,53038,Across Missouri,18/03/2020,"If you are a business owner, you are probably ...",Negative
40597,85549,"Florida, USA",10/04/2020,A Consumer Psychologist Details What Businesse...,Neutral
19640,64592,Australia,22/03/2020,#KathandKim predicted #toiletpaperpanic back i...,Neutral
28636,73588,"Los Angeles, CA",27/03/2020,#Work #Labor #Pandemic #Coronavirus #COVID?19 ...,Extremely Positive
21485,66437,,23/03/2020,@NBDR_CEO omg I just broke out in laughter ov...,Extremely Positive
43148,88100,,12/04/2020,@CNN In addition to the shortage of medical an...,Positive


Number in each sentiment category 

In [9]:
DATA["Sentiment"].value_counts()

Positive              12369
Negative              10958
Neutral                8332
Extremely Positive     7223
Extremely Negative     6073
Name: Sentiment, dtype: int64

Create balanced data

In [10]:
def balance(dataframe, n=500):
    """
    Create a balanced sample from imbalanced datasets.
    
    dataframe: 
        Pandas dataframe with a column called 'text' and one called 'label'
    n:         
        Number of samples from each Sentiment, defaults to 500
    """
    
    # Use pandas select a random bunch of examples from each label
    out = (dataframe.groupby('Sentiment', as_index=False)
            .apply(lambda x: x.sample(n=n))
            .reset_index(drop=True))
    
    return out

In [11]:
DATA_balanced = balance(DATA, 6000) # using the count of the smallest Sentiment category

In [12]:
DATA_balanced.shape

(30000, 5)

In [13]:
DATA_balanced["Sentiment"].value_counts()

Positive              6000
Negative              6000
Neutral               6000
Extremely Negative    6000
Extremely Positive    6000
Name: Sentiment, dtype: int64

extract info from DATA so we can use it 

In [14]:
tweet = DATA_balanced["OriginalTweet"]
sentiment = DATA_balanced["Sentiment"]

In [18]:
tweet[:10]

0    Hi our farmers are keeping the supermarket she...
1    Scammers are using illegal robocalls to pitch ...
2    #Coronavirus Ppl are getting into panic buying...
3    A man has been accused of exploiting peopleÂ’s...
4    Im genuinely worried for the future of our mil...
5    This is just one of many videos from my food s...
6    One forecast sees Brent crude going as low as ...
7    2020 so far\r\r\n?Australian Bush fires\r\r\n?...
8    YÂ’all people need to stop panic buying! How m...
9    Just dropped my daughter off for a shift at Mc...
Name: OriginalTweet, dtype: object

In [19]:
DATA_balanced.sample(100)

Unnamed: 0,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
3981,67660,Utah,24/03/2020,for Testing for 19 is only conducted in verifi...,Extremely Negative
2328,71295,,25/03/2020,At the next State of the Union address I want ...,Extremely Negative
21187,69531,"NJ, USA",25/03/2020,More online grocery orders More buying in bulk...,Neutral
20062,57087,Hampshire and Beyond,19/03/2020,Sainsbury's packed with customers for Over 70s...,Neutral
16996,78072,United States,05/04/2020,Going to the supermarket was so strange. And t...,Negative
...,...,...,...,...,...
28626,69024,"Worcester, Massachusetts",24/03/2020,"You're reading this on Twitter, so we're going...",Positive
15682,49617,"Sheffield, the North",17/03/2020,A pledge; no supermarket beer until #COVID2019...,Negative
2290,88867,,13/04/2020,New on the blog....Panic Buying: Combating Foo...,Extremely Negative
2142,77585,"Warrenton, Virginia",05/04/2020,We went for a drive. We are all fucked. Nobody...,Extremely Negative


Train test set

In [20]:
X_train, X_test, y_train, y_test = train_test_split(tweet,           # texts for the model
                                                    sentiment,          # classification labels
                                                    test_size=0.2,   # create an 80/20 split
                                                    random_state=42) # random state for reproducibility

In [21]:
y_test.value_counts()

Extremely Positive    1244
Negative              1216
Positive              1190
Extremely Negative    1179
Neutral               1171
Name: Sentiment, dtype: int64

In [22]:
y_train.value_counts()

Neutral               4829
Extremely Negative    4821
Positive              4810
Negative              4784
Extremely Positive    4756
Name: Sentiment, dtype: int64

__Vectorizing and Feature Extraction__

In [44]:
vectorizer = TfidfVectorizer(ngram_range = (1,2),     # unigrams and bigrams (1 word and 2 word units)
                             lowercase = True,       # why use lowercase?
                             max_df = 0.95,           # remove very common words
                             min_df = 0.05,           # remove very rare words
                             max_features = 1000)      # keep only top 500 features

# This vectorizer is then used to turn all of our documents into a vector of numbers, instead of text.

In [45]:
# First we do it for our training data...
X_train_feats = vectorizer.fit_transform(X_train)
#... then we do it for our test data
X_test_feats = vectorizer.transform(X_test)
# We can also create a list of the feature names. 
feature_names = vectorizer.get_feature_names()

In [53]:
print(X_train_feats[0])

  (0, 34)	0.30808155865826475
  (0, 13)	0.307182740641691
  (0, 33)	0.3078486651083938
  (0, 60)	0.6048243429868285
  (0, 12)	0.5917273279037271


In [59]:
print(feature_names[:100])

['19', 'about', 'all', 'amp', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'can', 'co', 'consumer', 'coronavirus', 'covid', 'covid 19', 'covid19', 'covid_19', 'do', 'during', 'food', 'for', 'from', 'get', 'go', 'grocery', 'grocery store', 'has', 'have', 'home', 'how', 'https', 'https co', 'if', 'in', 'in the', 'is', 'it', 'just', 'like', 'more', 'my', 'need', 'no', 'not', 'now', 'of', 'of the', 'on', 'online', 'or', 'our', 'out', 'pandemic', 'panic', 'people', 'prices', 'sanitizer', 'shopping', 'so', 'some', 'stock', 'store', 'supermarket', 'that', 'the', 'the coronavirus', 'the grocery', 'their', 'there', 'they', 'this', 'time', 'to', 'to the', 'up', 'us', 'was', 'we', 'what', 'when', 'who', 'will', 'with', 'you', 'your']


In [54]:
sc = preprocessing.StandardScaler(with_mean=False)
X_scaled = sc.fit_transform(X_train_feats)

In [61]:
print(X_train_feats)

  (0, 34)	0.30808155865826475
  (0, 13)	0.307182740641691
  (0, 33)	0.3078486651083938
  (0, 60)	0.6048243429868285
  (0, 12)	0.5917273279037271
  (1, 76)	0.27156276327140116
  (1, 7)	0.24558993681778626
  (1, 6)	0.19511260461427565
  (1, 66)	0.23405263431421544
  (1, 65)	0.2211906081309274
  (1, 26)	0.31138847297666156
  (1, 75)	0.3651488888546413
  (1, 48)	0.1525297738013317
  (1, 31)	0.3115114032410228
  (1, 23)	0.1798471518030512
  (1, 3)	0.26273588237999596
  (1, 36)	0.1586504496723947
  (1, 67)	0.235462975895514
  (1, 19)	0.25628222719195326
  (1, 24)	0.2484015493653924
  (1, 2)	0.2533602003520663
  (2, 30)	1.0
  (3, 28)	0.1664308146271363
  (3, 15)	0.10001809443088566
  (3, 64)	0.1487923921669153
  :	:
  (23997, 48)	0.1432851801033097
  (23997, 24)	0.4666926312382781
  (23998, 22)	0.28308760096186847
  (23998, 37)	0.352431148231316
  (23998, 40)	0.3806266675984466
  (23998, 56)	0.3652366406841327
  (23998, 17)	0.23503401326549742
  (23998, 0)	0.2271201072060144
  (23998, 16)	0.2

__Classifying and predicting__

In [65]:
classifier = LogisticRegression(random_state=42).fit(X_train_feats, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [66]:
sc = preprocessing.StandardScaler(with_mean=False)
X_scaled = sc.fit_transform(X_train_feats)

In [67]:
classifier = LogisticRegression(random_state=42).fit(X_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
