# Preprocess data

The **`sentiment`** data set consists of 3000 sentences which come from reviews on `imdb.com`, `amazon.com`, and `yelp.com`. Each sentence is labeled according to whether it comes from a positive review or negative review.

In [1]:
import string
import numpy as np
import pandas as pd

In [2]:
## Read in the data set.
with open("1_sentiment_dataset.txt") as f:
    content = f.readlines()
    
## Remove leading and trailing white space
content = [x.strip() for x in content]

## Separate the sentences from the labels
sentences = [x.split("\t")[0] for x in content]
labels = [x.split("\t")[1] for x in content]
y = np.array(labels, dtype='int8')
# Relabel from (0/1) to (-1/1)
y = 2*y -1

In [3]:
## full_remove takes a string x and a list of characters removal_list 
## returns x with all the characters in removal_list replaced by ' '
def full_remove(x, removal_list):
    for w in removal_list:
        x = x.replace(w, ' ')
    return x

## Remove digits
digits = [str(x) for x in range(10)]
digit_less = [full_remove(x, digits) for x in sentences]

## Remove punctuation
punc_less = [full_remove(x, list(string.punctuation)) for x in digit_less]

## Make everything lower-case
sents_lower = [x.lower() for x in punc_less]

## Define our stop words
stop_set = set(['the', 'a', 'an', 'i', 'he', 'she', 'they', 'to', 'of', 'it', 'from'])

## Remove stop words
sents_split = [x.split() for x in sents_lower]
sents_processed = [" ".join(list(filter(lambda a: a not in stop_set, x))) for x in sents_split]

In [4]:
sentences[0:10]

['So there is no way for me to plug it in here in the US unless I go by a converter.',
 'Good case, Excellent value.',
 'Great for the jawbone.',
 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
 'The mic is great.',
 'I have to jiggle the plug to get it to line up right to get decent volume.',
 'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.',
 'If you are Razr owner...you must have this!',
 'Needless to say, I wasted my money.',
 'What a waste of money and time!.']

In [5]:
sents_processed[0:10]

['so there is no way for me plug in here in us unless go by converter',
 'good case excellent value',
 'great for jawbone',
 'tied charger for conversations lasting more than minutes major problems',
 'mic is great',
 'have jiggle plug get line up right get decent volume',
 'if you have several dozen or several hundred contacts then imagine fun sending each them one by one',
 'if you are razr owner you must have this',
 'needless say wasted my money',
 'what waste money and time']

In [6]:
## Transform to bag of words representation.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 4500)
data_features = vectorizer.fit_transform(sents_processed)
X = data_features.toarray()
vocabulary = vectorizer.get_feature_names()
X_with_headers = pd.DataFrame(X,columns=vectorizer.get_feature_names())

In [7]:
X_with_headers

Unnamed: 0,aailiyah,abandoned,abhor,ability,able,abound,about,above,abroad,absolutel,...,your,yourself,youtube,yum,yummy,yun,zero,zillion,zombie,zombiez
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
X.shape

(3000, 4500)

In [9]:
y.shape

(3000,)

In [10]:
X_with_headers.shape

(3000, 4500)

In [11]:
n = 0
print(sents_processed[n])
X_with_headers.iloc[:,list(np.where(X[n,:]>0)[0])]

so there is no way for me plug in here in us unless go by converter


Unnamed: 0,by,converter,for,go,here,in,is,me,no,plug,so,there,unless,us,way
0,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2996,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2997,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
2998,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Exercises

In [12]:
## Split the dataset (X and y) into testing and training sets (use test_size=0.2, random_state=100)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=100)

In [None]:
# Fit a k-nearest neighbor model to the training data
# Evaluate training and test error for different values of neighbors k
# However, this can take a lot of time to calculate for even small values of k...
from sklearn.neighbors import KNeighborsClassifier
max_k = 5
k = np.arange(1, max_k + 1, 1)
err_test = np.zeros(len(k))
err_train = np.zeros(len(k))
for i in k:
    clf = KNeighborsClassifier(n_neighbors=k[i-1])
    clf.fit(X_train, y_train)
    err_train[i-1] = 1 - clf.score(X_train, y_train)
    clf.fit(X_test, y_test)
    err_test[i-1] = 1 - clf.score(X_test, y_test)
plt.figure(figsize=[10,8])
plt.scatter(k, err_test)
plt.scatter(k, err_train)
plt.legend(["Test_error", "Train_error"])
plt.grid(True)

In [13]:
from sklearn.neighbors import KNeighborsClassifier
k=1
clf = KNeighborsClassifier(n_neighbors=k)
clf.fit(X_train,y_train)
print(k, "{0:.2f} / {1:.2f}".format(clf.score(X_train, y_train),clf.score(X_test, y_test)))

1 1.00 / 0.68


In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {"n_neighbors": np.arange(1, 50)}
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X_train, y_train)
print(knn_cv.best_params_, knn_cv.best_score_)

In [None]:
for n in [1,2,3,4,5,7,9,15]:
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(X_train, y_train)
    print("Set accuracy for {} neighbors (train/test): {:.4f}/{:.4f}".format(n, clf.score(X_train, y_train), clf.score(X_test, y_test)))

In [14]:
# Fit a linear SVM model to the training data
# Evaluate training and test error for different values of regularization parameters C
from sklearn.svm import LinearSVC
for c in [1, 10, 100, 1000]:
    clf_SVC = LinearSVC(C=c) 
    clf_SVC.fit(X_train, y_train)
    print("Training score: {}".format(clf_SVC.score(X_train, y_train)))
    print("Test score: {}".format(clf_SVC.score(X_test, y_test)))
    print("Test error: {}".format(1-(clf_SVC.score(X_test, y_test))))

Training score: 0.9958333333333333
Test score: 0.8216666666666667
Test error: 0.17833333333333334




Training score: 0.9991666666666666
Test score: 0.7883333333333333
Test error: 0.21166666666666667




Training score: 0.9991666666666666
Test score: 0.765
Test error: 0.235
Training score: 0.9975
Test score: 0.775
Test error: 0.22499999999999998




In [16]:
# Fit a logistic regression model to the training data
# Evaluate training and test error for different values of regularization parameters C
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression(random_state=60, C=1)
clf_log.fit(X_train,y_train)
coefficients = pd.Series((clf_log.coef_).reshape(clf_log.coef_.shape[1],))    

In [18]:
aux = pd.DataFrame(pd.concat([pd.Series(X_with_headers.columns), coefficients],axis=1))
aux.columns=['word','coeff']
aux.sort_values(by='coeff', ascending=False, inplace=True)
top = aux.head(50)
bot = aux.tail(50)
print('\n top 50 \n')
display(top)
print('\n bot 50 \n')
display(bot)


 top 50 



Unnamed: 0,word,coeff
1737,great,3.290341
2632,nice,2.268036
2353,love,2.253906
1365,excellent,2.074384
119,amazing,1.930996
255,awesome,1.913781
1718,good,1.880899
358,best,1.805483
2354,loved,1.7956
1446,fantastic,1.704113



 bot 50 



Unnamed: 0,word,coeff
2663,nothing,-0.896598
2626,never,-0.898112
3491,shame,-0.901674
183,aren,-0.908134
2896,piece,-0.910898
167,appalling,-0.913821
4201,unreliable,-0.923778
4462,wouldn,-0.925374
220,at,-0.934384
3536,sick,-0.937175


In [19]:
# Get the coefficients of the logistic regression model with C=1 and random_state=60
# Display the feature names of the highest 50 (positive) coefficients
# Display the feature names of the lowest 50 (negative) coefficients
logreg = LogisticRegression(C=1, random_state=60)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=60)
logreg.fit(X_train, y_train)
logreg.coef_
ind_sorted = np.argsort(clf.coef_.reshape(4500))
ind_sorted
#vocabulary = vectorizer.get_feature_names()
high_pos = X_with_headers.columns[ind_sorted[:50]]
high_neg = X_with_headers.columns[ind_sorted[-50:]]
print(high_pos)
print(high_neg)

Index(['not', 'avoid', 'stupid', 'dirty', 'poor', 'appalling', 'return',
       'wasn', 'bad', 'doesn', 'average', 'dont', 'worst', 'rating', 'joke',
       'disappointment', 'stinks', 'unreliable', 'forced', 'directing',
       'torture', 'crap', 'slow', 'isn', 'bring', 'shame', 'hate', 'lame',
       'weak', 'sorry', 'starter', 'aren', 'dislike', 'unrecommended',
       'unpleasant', 'unacceptable', 'roasted', 'shameful', 'dissapointing',
       'awful', 'buying', 'nothing', 'displeased', 'disappointing', 'waste',
       'mistake', 'rated', 'mediocre', 'let', 'hours'],
      dtype='object')
Index(['cool', 'stunning', 'worthwhile', 'assure', 'soundtrack', 'portrayal',
       'carry', 'extended', 'bowl', 'animation', 'miss', 'remarkable',
       'sweetest', 'ordering', 'definitely', 'joy', 'best', 'comfortable',
       'funny', 'happy', 'easy', 'works', 'fun', 'beautiful', 'advise', 'game',
       'superb', 'must', 'fantastic', 'perfect', 'thumbs', 'good', 'enjoyed',
       'liked', 'i

In [15]:
from sklearn import svm
#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel
#Train the model using the training sets
clf.fit(X_train, y_train)
#Predict the response for test dataset
y_pred = clf.predict(X_test)
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))
# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.8233333333333334
Precision: 0.8260869565217391
Recall: 0.8205980066445183


In [12]:
## Split the dataset (X and y) into testing and training sets (use test_size=0.2, random_state=100)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [13]:
# Fit a k-nearest neighbor model to the training data
# Evaluate training and test error for different values of neighbors k
# However, this can take a lot of time to calculate for even small values of k...
from sklearn.neighbors import KNeighborsClassifier
for k in [1,3,5,10]:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train,y_train)
    print(k, "{0:.2f} / {1:.2f}".format(clf.score(X_train, y_train),clf.score(X_test, y_test)))

1 1.00 / 0.68
3 0.80 / 0.66
5 0.76 / 0.64
10 0.73 / 0.67


In [14]:
# Fit a linear SVM model to the training data
# Evaluate training and test error for different values of regularization parameters C
from sklearn.svm import LinearSVC
for c in [0.01,0.1,1,10,100,1000]:
    clf = LinearSVC(C=c,max_iter=10000,random_state=100)
    clf.fit(X_train,y_train)
    print(c, "{0:.2f} / {1:.2f}".format(clf.score(X_train, y_train),clf.score(X_test, y_test)))

0.01 0.89 / 0.79
0.1 0.97 / 0.84
1 1.00 / 0.82
10 1.00 / 0.79
100 1.00 / 0.77
1000 1.00 / 0.77


In [15]:
# Fit a logistic regression model to the training data
# Evaluate training and test error for different values of regularization parameters C
from sklearn.linear_model import LogisticRegression
for c in [0.01,0.1,1,10,100,1000]:
    clf = LogisticRegression(C=c,max_iter=10000,random_state=60)
    clf.fit(X_train,y_train)
    print(c, "{0:.2f} / {1:.2f}".format(clf.score(X_train, y_train),clf.score(X_test, y_test)))

0.01 0.77 / 0.72
0.1 0.89 / 0.79
1 0.97 / 0.85
10 1.00 / 0.83
100 1.00 / 0.80
1000 1.00 / 0.79


In [16]:
# Get the coefficients of the logistic regression model with C=1 and random_state=60
# Display the feature names of the highest 50 (positive) coefficients
# Display the feature names of the lowest 50 (negative) coefficients
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1,max_iter=10000,random_state=60)
clf.fit(X_train,y_train)
ind_sorted = np.argsort(clf.coef_.reshape(4500))
vocabulary = np.array(vectorizer.get_feature_names())
print("Words with most negative coefficients")
print(vocabulary[ind_sorted[0:50]])
print("\n")
print("Words with most positive coefficients")
print(vocabulary[ind_sorted[-50:]])

Words with most negative coefficients
['bad' 'not' 'poor' 'worst' 'slow' 'terrible' 'stupid' 'awful'
 'disappointing' 'avoid' 'waste' 'wasn' 'horrible' 'disappointment'
 'fails' 'average' 'doesn' 'rude' 'crap' 'then' 'mediocre' 'bland'
 'return' 'plot' 'didn' 'disappointed' 'dont' 'sucked' 'dirty' 'don'
 'hate' 'old' 'hours' 'low' 'lame' 'worse' 'directing' 'failed' 'probably'
 'unfortunately' 'sick' 'at' 'wouldn' 'unreliable' 'appalling' 'piece'
 'aren' 'shame' 'never' 'nothing']


Words with most positive coefficients
['thumbs' 'world' 'friendly' 'played' 'glad' 'usually' 'restaurant'
 'brilliant' 'plus' 'exactly' 'soundtrack' 'watch' 'comfortable' 'must'
 'every' 'joy' 'years' 'family' 'fine' 'recommend' 'incredible' 'cool'
 'definitely' 'pleased' 'mouth' 'fun' 'wonderful' 'well' 'easy' 'funny'
 'interesting' 'happy' 'liked' 'enjoyed' 'game' 'perfect' 'works'
 'happier' 'beautiful' 'delicious' 'fantastic' 'loved' 'best' 'good'
 'awesome' 'amazing' 'excellent' 'love' 'nice' 'great']
