In [10]:
import os
import json
import pandas as pd
import numpy as np
from re import sub, compile
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [11]:
# the location for tweets in json format
data_dir = "sanders data\\sanders\\rawdata" # you need to change this

# define a function that reads read id and text from each json file
def read_data(file_name):
    with open(file_name, "r") as in_file:
        status = json.loads(in_file.read())
        try:
            return [status["id"], status["text"]]
        except KeyError:
            return ["", ""]
    

# the os.lisdtdir method return a list of file name (not full path) in the specified directory
# the os.path.join joins a directory path and a file name to make a full file path
file_names = list(map(lambda x: os.path.join(data_dir, x), os.listdir(data_dir)))

# read id and text
data1 = list(map(read_data, file_names))

# convert the list of ids and texts to a data frame
df_data1 = pd.DataFrame(data1, columns=["id", "text"])

# convert id from interger to string
print(df_data1.head())

                   id                                               text
0  125082707389718529  Today very cheap for 2198597 Icemaker.And Free...
1  125085987431923713  @fashionNOGuilt haha! tomorrow should be less ...
2  125129328446017536                                DAMN YOU !!! @apple
3  125165176772247552                      Love love love iOS 5!! @apple
4  125184213342367744  Discount Hemp Knots today.Cheap price too.Save...


In [12]:
print("Missing data: %d" % len(df_data1[df_data1["id"] == ""]))
# remove missing data
df_data1 = df_data1[df_data1["id"] != ""]
print("Sample size: %d" % len(df_data1))

Missing data: 592
Sample size: 4921


In [13]:
# read the labels
df_labels = pd.read_csv("sanders data\\sanders\\corpus.csv",
                        names=["topic", "polarity", "id"])
print(df_labels.head())

   topic  polarity                  id
0  apple  positive  126415614616154112
1  apple  positive  126404574230740992
2  apple  positive  126402758403305474
3  apple  positive  126397179614068736
4  apple  positive  126395626979196928


In [14]:
# join labels with texts
print(type(df_labels["id"][0]))
print(type(df_data1["id"][0]))
df_data1["id"] = pd.to_numeric(df_data1["id"])
print(type(df_data1["id"][0]))
df_data2 = df_data1.merge(df_labels, on="id", how="left")
print(df_data2.head())

<class 'numpy.int64'>
<class 'int'>
<class 'numpy.int64'>
                   id                                               text  \
0  125082707389718529  Today very cheap for 2198597 Icemaker.And Free...   
1  125085987431923713  @fashionNOGuilt haha! tomorrow should be less ...   
2  125129328446017536                                DAMN YOU !!! @apple   
3  125165176772247552                      Love love love iOS 5!! @apple   
4  125184213342367744  Discount Hemp Knots today.Cheap price too.Save...   

   topic    polarity  
0  apple  irrelevant  
1  apple     neutral  
2  apple    negative  
3  apple    positive  
4  apple  irrelevant  


In [15]:
# check the distribution of topic and polarity
print(df_data2["topic"].value_counts())
print(df_data2["polarity"].value_counts())

microsoft    1304
google       1287
twitter      1219
apple        1111
Name: topic, dtype: int64
neutral       2244
irrelevant    1623
negative       548
positive       506
Name: polarity, dtype: int64


### Preprecessing

In [16]:
# define a custom tokenizer
def tokenization(text):
    # replace mention 
    text = sub("@[^ ]+", " ", text)
    # replace hashtags with space
    text = sub("#[^ ]+", " ", text)
    # replace RT (retweet) with space
    text = sub("RT", " ", text)
    # replace URL with space
    text = sub("http[^ ]+", " ", text)

    p = compile("[^a-z]")
    # conver the text to lower case and split by non-alphabetic characters
    # also remove "" due to tokenizing multple spaces
    return [token for token in p.split(text.lower()) if token != ""]
        
# test the tokenization function on a single text
tokenization(df_data2["text"].iloc[0])

['today',
 'very',
 'cheap',
 'for',
 'icemaker',
 'and',
 'free',
 'shipping',
 'for',
 'icemaker',
 'too']

In [17]:
from sklearn.linear_model import SGDClassifier
# 10-fold cross validation
skf = StratifiedKFold(n_splits=10)
fold = 0
f1 = []
for train_index, test_index in skf.split(df_data2["text"], df_data2["polarity"]):
#for train_index, test_index in skf:
    fold += 1
    print("Fold %d" % fold)
    # partition
    train_x, test_x = df_data2["text"].iloc[train_index], df_data2["text"].iloc[test_index]
    train_y, test_y = df_data2["polarity"].iloc[train_index], df_data2["polarity"].iloc[test_index]
    # vectorize
    vectorizer = TfidfVectorizer(tokenizer=tokenization, max_df=0.8, stop_words='english')
    X = vectorizer.fit_transform(train_x)
    X_test = vectorizer.transform(test_x)
    # train model
    clf = SGDClassifier(random_state=fold)
    clf.fit(X, train_y)
    # predict
    pred_y = clf.predict(X_test)
    # classification results
    for line in metrics.classification_report(test_y, pred_y).split("\n"):
        print(line)
    f1.append(metrics.f1_score(test_y, pred_y, average='weighted'))
print("Average F1: %.2f" % np.mean(f1))

Fold 1
              precision    recall  f1-score   support

  irrelevant       0.94      0.66      0.77       163
    negative       0.33      0.51      0.40        55
     neutral       0.62      0.66      0.64       224
    positive       0.38      0.41      0.39        51

    accuracy                           0.62       493
   macro avg       0.57      0.56      0.55       493
weighted avg       0.67      0.62      0.63       493

Fold 2
              precision    recall  f1-score   support

  irrelevant       0.92      0.75      0.83       163
    negative       0.52      0.64      0.57        55
     neutral       0.67      0.69      0.68       224
    positive       0.33      0.42      0.37        50

    accuracy                           0.67       492
   macro avg       0.61      0.62      0.61       492
weighted avg       0.70      0.67      0.69       492

Fold 3
              precision    recall  f1-score   support

  irrelevant       0.92      0.70      0.79       163


### Using sentiment lexicon for classification

Download and unzip the opinion lexicon from http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html or the files is posted on D2l

In [18]:
# read the lexicon
lexicon = dict()

# read postive words
with open("opinion-lexicon-English\\negative-words.txt", "r") as in_file:
    for line in in_file.readlines():
        if not line.startswith(";") and line != "\n":
            lexicon[line.strip()] = -1

# read negative words
with open("opinion-lexicon-English\\positive-words.txt", "r") as in_file:
    for line in in_file.readlines():
        if not line.startswith(";") and line != "\n":
            lexicon[line.strip()] = 1

# print the top 5 entries
for i, (k, v) in enumerate(lexicon.items()):
    print(k, v)
    if i > 4: break

2-faced -1
2-faces -1
abnormal -1
abolish -1
abominable -1
abominably -1


In [19]:
# define a function that uses sentiment word voting to classify sentiment
def lexicon_classify(text):
    score = 0
    for token in tokenization(text):
        score += lexicon.get(token, 0)
    if score > 0: return "positive"
    elif score <0: return "negative"
    else: return "neutral"
    
# test the function on a single text
text = "@Apple: Siri is amazing!!! I'm in love!"
print(text)
print(lexicon_classify("@Apple: Siri is amazing!!! I'm in love!"))

# alternatively, you can return the score instead of a class using the function

@Apple: Siri is amazing!!! I'm in love!
positive


In [20]:
# before using the lexicon for classification, combine "neutral" and "irrelevant"
df_data2["polarity2"] = df_data2["polarity"].apply(lambda x: "neutral" if x=="irrelevant" else x)
df_data2["polarity2"].value_counts()

neutral     3867
negative     548
positive     506
Name: polarity2, dtype: int64

In [21]:
# classify using the lexicon
df_data2["lex_polarity"] = df_data2["text"].apply(lexicon_classify)
df_data2["lex_polarity"].value_counts()

neutral     3208
positive    1056
negative     657
Name: lex_polarity, dtype: int64

In [22]:
# measure the classification performance
for line in metrics.classification_report(df_data2["polarity2"], df_data2["lex_polarity"]).split("\n"):
    print(line)

              precision    recall  f1-score   support

    negative       0.38      0.46      0.41       548
     neutral       0.88      0.73      0.80      3867
    positive       0.29      0.59      0.39       506

    accuracy                           0.69      4921
   macro avg       0.52      0.59      0.53      4921
weighted avg       0.77      0.69      0.72      4921



In [23]:
# use svm for the 3-class classification
# 10-fold cross validation
skf = StratifiedKFold(n_splits=10)
fold = 0

# a container for f1 score
f1 = []
for train_index, test_index in skf.split(df_data2['text'], df_data2['polarity2']):
    fold += 1
    print("Fold %d" % fold)
    # partition
    train_x, test_x = df_data2["text"].loc[train_index], df_data2["text"].loc[test_index]
    train_y, test_y = df_data2["polarity2"].loc[train_index], df_data2["polarity2"].loc[test_index]
    # vectorize
    vectorizer = TfidfVectorizer(tokenizer=tokenization, max_df=0.8, stop_words='english')
    X = vectorizer.fit_transform(train_x)
    X_test = vectorizer.transform(test_x)
    # train model
    clf = SGDClassifier(random_state=fold)
    clf.fit(X, train_y)
    # predict
    pred_y = clf.predict(X_test)
    # classification results
    for line in metrics.classification_report(test_y, pred_y).split("\n"):
        print(line)
    f1.append(metrics.f1_score(test_y, pred_y, average='weighted'))
print("Average F1: %.2f" % np.mean(f1))

Fold 1
              precision    recall  f1-score   support

    negative       0.29      0.56      0.39        55
     neutral       0.89      0.71      0.79       387
    positive       0.29      0.45      0.36        51

    accuracy                           0.67       493
   macro avg       0.49      0.57      0.51       493
weighted avg       0.76      0.67      0.70       493

Fold 2
              precision    recall  f1-score   support

    negative       0.50      0.67      0.57        55
     neutral       0.88      0.84      0.86       387
    positive       0.40      0.38      0.39        50

    accuracy                           0.78       492
   macro avg       0.59      0.63      0.61       492
weighted avg       0.79      0.78      0.78       492

Fold 3
              precision    recall  f1-score   support

    negative       0.72      0.51      0.60        55
     neutral       0.86      0.90      0.88       387
    positive       0.23      0.22      0.23        50


#### Classifying neutral is easy. Let's take a look at the binary (positive vs. negative) classification performance.

In [24]:
# subset data
df_data3 = df_data2[df_data2["polarity2"] != "neutral"]
df_data3["polarity2"].value_counts()
df_data3["polarity2"] = df_data3["polarity2"].map({"positive":1, "negative":0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data3["polarity2"] = df_data3["polarity2"].map({"positive":1, "negative":0})


In [25]:
df_data3.head()

Unnamed: 0,id,text,topic,polarity,polarity2,lex_polarity
2,125129328446017536,DAMN YOU !!! @apple,apple,negative,0,negative
3,125165176772247552,Love love love iOS 5!! @apple,apple,positive,1,positive
7,125202037293064192,RT @gdcurry: Really @Apple? What have you don...,apple,negative,0,neutral
14,125223685194915840,@ford should have teamed up with @Apple instea...,apple,negative,0,positive
15,125224588253741056,#Siri went down for a little while last night....,apple,negative,0,negative


In [26]:
# 10-fold cross validation
skf = StratifiedKFold(n_splits=10)

fold = 0

# a container for f1 score
f1 = []
for train_index, test_index in skf.split(df_data3['text'], df_data3['polarity2']):
    fold += 1
    print("Fold %d" % fold)
    # partition
    train_x, test_x = df_data3["text"].iloc[train_index], df_data3["text"].iloc[test_index]
    train_y, test_y = df_data3["polarity2"].iloc[train_index], df_data3["polarity2"].iloc[test_index]
    # vectorize
    vectorizer = TfidfVectorizer(tokenizer=tokenization, max_df=0.8, stop_words='english')
    X = vectorizer.fit_transform(train_x)
    X_test = vectorizer.transform(test_x)
    # train model
    clf = SGDClassifier()
    clf.fit(X, train_y)
    # predict
    pred_y = clf.predict(X_test)
    # classification results
    for line in metrics.classification_report(test_y, pred_y).split("\n"):
        print(line)
    f1.append(metrics.f1_score(test_y, pred_y))
print("Average F1: %.2f" % np.mean(f1))

Fold 1
              precision    recall  f1-score   support

           0       0.76      0.85      0.80        55
           1       0.82      0.71      0.76        51

    accuracy                           0.78       106
   macro avg       0.79      0.78      0.78       106
weighted avg       0.79      0.78      0.78       106

Fold 2
              precision    recall  f1-score   support

           0       0.75      0.96      0.84        55
           1       0.94      0.65      0.77        51

    accuracy                           0.81       106
   macro avg       0.84      0.81      0.80       106
weighted avg       0.84      0.81      0.81       106

Fold 3
              precision    recall  f1-score   support

           0       0.66      0.80      0.72        55
           1       0.72      0.55      0.62        51

    accuracy                           0.68       106
   macro avg       0.69      0.67      0.67       106
weighted avg       0.69      0.68      0.67       106

### We then try three methods for dimension reduction. 1. RFE feature selection

In [27]:
from sklearn.model_selection import train_test_split
vectorizer = TfidfVectorizer(tokenizer=tokenization, max_df=0.8, stop_words='english')
X = vectorizer.fit_transform(df_data3["text"])
# partition: train/test = 80/20
train_x, test_x, train_y, test_y = train_test_split(X, df_data3["polarity2"], test_size=0.2, stratify=df_data3["polarity2"], random_state=123)

In [28]:
# evaluate all features
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
clf = SGDClassifier()
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(n_splits=2), scoring="f1")
rfecv.fit(train_x, train_y)

RFECV(cv=StratifiedKFold(n_splits=2, random_state=None, shuffle=False),
      estimator=SGDClassifier(), scoring='f1')

In [29]:
columns=vectorizer.get_feature_names()
selected_terms = [x for (x, y) in zip(columns, rfecv.support_) if y==True]
vectorizer = TfidfVectorizer(tokenizer=tokenization, max_df=0.8, stop_words='english', vocabulary=selected_terms)
X = vectorizer.fit_transform(df_data3["text"])
train_x, test_x, train_y, test_y = train_test_split(X, df_data3["polarity2"], test_size=0.2, stratify=df_data3["polarity2"], random_state=123)

In [30]:
clf = SGDClassifier()
clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)
# classification results
for line in metrics.classification_report(test_y, pred_y).split("\n"):
    print(line)

              precision    recall  f1-score   support

           0       0.76      0.76      0.76       110
           1       0.74      0.74      0.74       101

    accuracy                           0.75       211
   macro avg       0.75      0.75      0.75       211
weighted avg       0.75      0.75      0.75       211



### 2. Using just sentiment terms in the dictionary

In [31]:
vocab = lexicon.keys()
vectorizer = TfidfVectorizer(tokenizer=tokenization, max_df=0.8, stop_words='english', vocabulary=vocab)
X = vectorizer.fit_transform(df_data3["text"])
train_x, test_x, train_y, test_y = train_test_split(X, df_data3["polarity2"], test_size=0.2, stratify=df_data3["polarity2"], random_state=123)
#test_x3 = keep_sentiment_terms(test_x)
clf = SGDClassifier()
clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)
# classification results
for line in metrics.classification_report(test_y, pred_y).split("\n"):
    print(line)

              precision    recall  f1-score   support

           0       0.65      0.87      0.75       110
           1       0.78      0.50      0.61       101

    accuracy                           0.69       211
   macro avg       0.72      0.68      0.68       211
weighted avg       0.71      0.69      0.68       211



### 3. PCA

In [32]:

#X_std = StandardScaler().fit_transform(X) # you need to do standardization, since pca is sensitive to the relative scaling of the original variables
vectorizer = TfidfVectorizer(tokenizer=tokenization, max_df=0.8, stop_words='english')
X = vectorizer.fit_transform(df_data3["text"]).todense()
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)
print(len(X[0]))
from sklearn.decomposition import PCA
pca = PCA(svd_solver='randomized',whiten=True).fit(X)
print(pca.explained_variance_ratio_)
sumofvariance=0.0
n_components = 0
for item in pca.explained_variance_ratio_:
    sumofvariance += item
    n_components+=1
    if sumofvariance>=0.9:
        break
print(n_components)
pca = PCA(n_components=n_components, svd_solver='randomized',whiten=True).fit(X)
X_train_pca = pca.transform(X)
train_x, test_x, train_y, test_y = train_test_split(X_train_pca, df_data3["polarity2"], test_size=0.2, stratify=df_data3["polarity2"], random_state=123)
clf = SGDClassifier()
clf.fit(train_x, train_y)
pred_y = clf.predict(test_x)
for line in metrics.classification_report(test_y, pred_y).split("\n"):
    print(line)

2468
[4.71367798e-03 3.60271771e-03 3.51245306e-03 ... 1.94535059e-36
 6.74341370e-37 3.42949719e-37]
639
              precision    recall  f1-score   support

           0       0.73      0.75      0.74       110
           1       0.71      0.69      0.70       101

    accuracy                           0.72       211
   macro avg       0.72      0.72      0.72       211
weighted avg       0.72      0.72      0.72       211

