# Sentiment Analysis

---
You are currently looking at **version 1.0** of this notebook.

---

*Note: Some of the cells in this notebook are computationally expensive. To reduce runtime, this notebook is using a subset of the data.*

### Import libraries

In [None]:
import pandas as pd
import numpy as np

# Use the HTML representation for pandas objects displayed in the notebook
pd.set_option('display.notebook_repr_html', True)

### Import data

In [None]:
# List the first five entries of the data directory (shell command via IPython)
!ls data | head -5

In [None]:
# Read the reviews CSV from the local data directory into a DataFrame
df = pd.read_csv('data/Amazon_Unlocked_Mobile.csv')

In [None]:
# Sample the data to speed up computation: keep a random 30% of the rows.
# random_state pins the draw so the same subset is selected on every fresh run.
# Comment out the sampling line to match with lecture.
# Note: df is reassigned here, so re-running this cell without reloading the
# CSV samples the already-sampled frame again.
df = df.sample(frac=0.3, random_state=10)
df.head()

In [None]:
# Drop rows with missing values (rebinding instead of mutating in place)
df = df.dropna()

# Remove any 'neutral' ratings equal to 3.
# .copy() makes the filtered frame independent of its parent, so adding a
# column below does not trigger pandas' SettingWithCopyWarning.
df = df[df['Rating'] != 3].copy()

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)

In [None]:
# Most ratings are positive: the mean of the 0/1 label is the fraction of
# reviews that were rated positively
df['Positively Rated'].mean()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation: the review text is the input and
# the binary 'Positively Rated' column is the target. random_state=0 keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['Reviews'],
    df['Positively Rated'],
    random_state=0,
)

In [None]:
# Show the first training review together with the shape of the training set
X_train.iloc[0], X_train.shape

# CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words vectorizer and learn its vocabulary from the
# training documents only
vect = CountVectorizer()
vect.fit(X_train);  # trailing ';' keeps the cell free of output, as before

In [None]:
# Vocabulary size plus every 2000th term as a quick sample of the features.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
vocabulary = vect.get_feature_names_out()
len(vocabulary), 'features step 2000:', vocabulary[::2000]

In [None]:
# Transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)
# Show the resulting matrix object as the cell output
X_train_vectorized

## Logistic regression
- classify reviews as positive or not

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [None]:
# Fit a logistic-regression classifier on the vectorized training documents;
# the fitted estimator is the cell's displayed value
model = LogisticRegression().fit(X_train_vectorized, y_train)
model

In [None]:
# Vectorize the test documents once and reuse the matrix below
X_test_vectorized = vect.transform(X_test)

# Hard 0/1 class predictions for the transformed test documents
predictions = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from a continuous score. Feeding it thresholded labels
# collapses the ROC curve to a single operating point. Column 1 of
# predict_proba is the probability of the positive class (label 1).
positive_scores = model.predict_proba(X_test_vectorized)[:, 1]
print('AUC: ', roc_auc_score(y_test, positive_scores))

In [None]:
# Report the features at both ends of a sorted index
def extreme_coeffs(feature_names, sorted_coef_index, N=10):
    """Print the N features at each end of an ascending index ordering.

    Parameters
    ----------
    feature_names : numpy array of feature names; it is indexed with slices
        of ``sorted_coef_index``.
    sorted_coef_index : index array ordering the features from smallest to
        largest value (e.g. the result of ``argsort``).
    N : how many features to show from each end (default 10).

    Returns
    -------
    None. The two selections are written to standard output.
    """
    lowest = sorted_coef_index[:N]
    # Reverse first, then truncate, so the largest value is listed first
    descending = sorted_coef_index[::-1]
    highest = descending[:N]

    print('Smallest Coefs:\n{}\n'.format(feature_names[lowest]))
    print('Largest Coefs: \n{}'.format(feature_names[highest]))

In [None]:
# Get the feature names as a numpy array so they can be indexed with an
# index array. get_feature_names_out() replaces get_feature_names(), which
# was removed in scikit-learn 1.2.
feature_names = np.array(vect.get_feature_names_out())

# Indices that order the model's coefficients from smallest to largest
sorted_coef_index = model.coef_[0].argsort()

# Show the 5 features at each end of that ordering
extreme_coeffs(feature_names, sorted_coef_index, 5)

# Tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit the TfidfVectorizer to the training data, specifying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)

# Number of features kept. get_feature_names_out() replaces
# get_feature_names(), which was removed in scikit-learn 1.2.
len(vect.get_feature_names_out())

In [None]:
# Re-vectorize the training documents with the current vectorizer and refit
X_train_vectorized = vect.transform(X_train)
model = LogisticRegression().fit(X_train_vectorized, y_train)

# Vectorize the test documents once and reuse the matrix below
X_test_vectorized = vect.transform(X_test)

# Hard 0/1 class predictions for the test documents
predictions = model.predict(X_test_vectorized)

# ROC AUC needs a continuous score, not thresholded labels; column 1 of
# predict_proba is the probability of the positive class (label 1).
positive_scores = model.predict_proba(X_test_vectorized)[:, 1]
print('AUC: ', roc_auc_score(y_test, positive_scores))

In [None]:
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
feature_names = np.array(vect.get_feature_names_out())

# Take the maximum along axis 0 of the training matrix (one value per
# feature column), then order the features by that maximum.
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

# extreme_coeffs labels its output "Coefs", but the values ranked here are
# the per-feature maxima computed above, not model coefficients.
extreme_coeffs(feature_names, sorted_tfidf_index, 5)

In [None]:
# Order the current model's coefficients from smallest to largest and show
# the 5 features at each end of that ordering
sorted_coef_index = model.coef_[0].argsort()
extreme_coeffs(feature_names, sorted_coef_index, 5)

In [None]:
# These reviews are treated the same by our current model: both sentences
# are made of the same words, only in a different order
sample_reviews = [
    'not an issue, phone is working',
    'an issue, phone is not working',
]
sample_matrix = vect.transform(sample_reviews)
print(model.predict(sample_matrix))

# n-grams

In [None]:
# Fit the CountVectorizer to the training data, specifying a minimum
# document frequency of 5 and extracting 2-grams and 3-grams
# NOTE(review): ngram_range=(2, 3) leaves out single-word features. If
# 1-grams and 2-grams were intended, this should be ngram_range=(1, 2) — confirm.
vect = CountVectorizer(min_df=5, ngram_range=(2, 3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

In [None]:
# Number of n-gram features and the first ten of them.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
ngram_names = vect.get_feature_names_out()
len(ngram_names), ngram_names[:10]

In [None]:
# Show the current training document-term matrix object as the cell output
X_train_vectorized

In [None]:
# Refit the classifier on the current training matrix
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Vectorize the test documents once and reuse the matrix below
X_test_vectorized = vect.transform(X_test)

# Hard 0/1 class predictions for the test documents
predictions = model.predict(X_test_vectorized)

# ROC AUC needs a continuous score, not thresholded labels; column 1 of
# predict_proba is the probability of the positive class (label 1).
positive_scores = model.predict_proba(X_test_vectorized)[:, 1]
print('AUC: ', roc_auc_score(y_test, positive_scores))

In [None]:
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
feature_names = np.array(vect.get_feature_names_out())

# Indices that order the model's coefficients from smallest to largest
sorted_coef_index = model.coef_[0].argsort()

# Show the features at each end of that ordering (default N=10)
extreme_coeffs(feature_names, sorted_coef_index)

In [None]:
# These reviews are now correctly identified
new_reviews = [
    'not an issue, phone is working',
    'an issue, phone is not working',
    'not great',
    'absolute shit',
    'not epic',
]
X_new = vect.transform(new_reviews)
print(model.predict(X_new))