# Lab : Machine Learning Intro 
## Data Set Name : Sentiment Labelled Sentences Data Set
## Author : Basel Atalla
## Date : 27/06/2021

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
filepath_dict = {'yelp':   'data/sentiment_analysis/yelp_labelled.txt',
                 'amazon': 'data/sentiment_analysis/amazon_cells_labelled.txt',
                 'imdb':   'data/sentiment_analysis/imdb_labelled.txt'}


In [3]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)
df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [4]:
sentences = ['John likes ice cream', 'John hates chocolate']

In [5]:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [6]:
vectorizer.transform(sentences).toarray()


array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [7]:
df_yelp = df[df['source'] == 'yelp']
df_amazon = df[df['source'] == 'amazon']
df_imdb = df[df['source'] == 'imdb']

In [8]:
yelp_sentences = df_yelp['sentence'].values
amazon_sentences = df_amazon['sentence'].values
imdb_sentences = df_imdb['sentence'].values

In [9]:
y_yelp = df_yelp['label'].values
y_amazon = df_amazon['label'].values
y_imdb = df_imdb['label'].values

In [10]:
yelp_sentences_train, yelp_sentences_test, y_yelp_train, y_yelp_test = train_test_split(yelp_sentences, y_yelp, test_size=0.25, random_state=1000)

In [11]:

amazon_sentences_train, amazon_sentences_test, y_amazon_train, y_amazon_test = train_test_split(amazon_sentences, y_amazon, test_size=0.25, random_state=1000)

In [12]:

imdb_sentences_train, imdb_sentences_test, y_imdb_train, y_imdb_test = train_test_split(imdb_sentences, y_imdb, test_size=0.25, random_state=1000)

In [13]:
vectorizer_yelp = CountVectorizer()
vectorizer_yelp.fit(yelp_sentences_train)

X_yelp_train = vectorizer_yelp.transform(yelp_sentences_train)
X_yelp_test  = vectorizer_yelp.transform(yelp_sentences_test)
X_yelp_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [14]:
vectorizer_amazon = CountVectorizer()
vectorizer_amazon.fit(amazon_sentences_train)

X_amazon_train = vectorizer_amazon.transform(amazon_sentences_train)
X_amazon_test  = vectorizer_amazon.transform(amazon_sentences_test)
X_amazon_train

<750x1546 sparse matrix of type '<class 'numpy.int64'>'
	with 6817 stored elements in Compressed Sparse Row format>

In [15]:
vectorizer_imdb = CountVectorizer()
vectorizer_imdb.fit(imdb_sentences_train)

X_imdb_train = vectorizer_imdb.transform(imdb_sentences_train)
X_imdb_test  = vectorizer_imdb.transform(imdb_sentences_test)
X_imdb_train

<561x2505 sparse matrix of type '<class 'numpy.int64'>'
	with 8413 stored elements in Compressed Sparse Row format>

In [16]:
classifier_yelp = LogisticRegression()
classifier_yelp.fit(X_yelp_train, y_yelp_train)
score_yelp = classifier_yelp.score(X_yelp_test, y_yelp_test)

print("Accuracy:", score_yelp)

Accuracy: 0.796


In [17]:
classifier_amazon = LogisticRegression()
classifier_amazon.fit(X_amazon_train, y_amazon_train)
score_amazon = classifier_amazon.score(X_amazon_test, y_amazon_test)

print("Accuracy:", score_amazon)

Accuracy: 0.796


In [18]:
classifier_imdb = LogisticRegression()
classifier_imdb.fit(X_imdb_train, y_imdb_train)
score_imdb = classifier_imdb.score(X_imdb_test, y_imdb_test)

print("Accuracy:", score_imdb)

Accuracy: 0.7486631016042781
