In [1]:
# Download data from the UCI sentiment labelled sentences dataset

import pandas as pd
filepath_dict = {'yelp': 'yelp_labelled.txt', 'amazon' : 'amazon_cells_labelled.txt', 'imdb' : 'imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)
    
df = pd.concat(df_list)
print(df.iloc[0])

sentence    So there is no way for me to plug it in here i...
label                                                       0
source                                                 amazon
Name: 0, dtype: object


In [2]:
df.head()

Unnamed: 0,sentence,label,source
0,So there is no way for me to plug it in here i...,0,amazon
1,"Good case, Excellent value.",1,amazon
2,Great for the jawbone.,1,amazon
3,Tied to charger for conversations lasting more...,0,amazon
4,The mic is great.,1,amazon


In [3]:
# Now let's do some feature engineering
# Vectorize the sentences

sentences = ['John likes ice cream', 'John hates chocolate.']

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'chocolate': 1, 'cream': 2, 'hates': 3, 'ice': 4, 'likes': 5}

In [5]:
# create a topic vector
# Remember bag of words? Refresh your knowledge of the bag of words model
# Important terms are term frequency, Inverse log frequency, cosine similarity, Naive Bayes

vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [7]:
# split the data into training and testing set
# Let's start with the yelp dataset

from sklearn.model_selection import train_test_split
df_yelp = df[df['source'] == 'yelp']

sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)


from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [8]:
# We have 750 samples
# And 1714 (dimension) is the size of the vocabulary

# sparse matrix is a data type that is optimized for matrices with 
# only a few non-zero elements, which only keeps track of the non-zero elements reducing the memory load

# Use logistic regression for classification
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy: ", score)

Accuracy:  0.796


In [9]:
# Let's see how the logit model performs for our Yelp-Amazon-IMDB dataset

for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values
    
    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)
    
    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)
    
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for amazon data: 0.7960
Accuracy for yelp data: 0.7960
Accuracy for imdb data: 0.7487


In [None]:
# This accuracy is fairly good
# Next we will build a Neural Network based model