## lab16: machine learning intro

In [73]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [74]:

filepath = {
    "yelp": "yelp_labelled.txt",
    "amazon": "amazon_cells_labelled.txt",
    "imdb": "imdb_labelled.txt",
}
# reading from different data sources
data_list = []

for source, path in filepath.items():
    review_data = pd.read_csv(path, names = ["sentences", "label"], sep = "\t")
    review_data["source"] = source
    data_list.append(review_data)

# concatenate them into one data frame
review_data = pd.concat(data_list)
review_data


Unnamed: 0,sentences,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [75]:
# separate the data by the original source
yelp_data = review_data[review_data["source"] == "yelp"]
amazon_data = review_data[review_data["source"] == "amazon"]
imdb_data = review_data[review_data["source"] == "imdb"]


In [76]:
# reshape yelp data
yelp_X = yelp_data["sentences"].values
yelp_y = yelp_data["label"].values

# reshape amazon data
amazon_X = amazon_data["sentences"].values
amazon_y = amazon_data["label"].values


# reshape imdb data
imdb_X = imdb_data["sentences"].values
imdb_y = imdb_data["label"].values



In [77]:
# split yelp data
yelp_x_train, yelp_x_test, yelp_y_train, yelp_y_test = train_test_split(yelp_X, yelp_y, test_size = 0.3, random_state = 50)
# split amazon data
amazon_x_train, amazon_x_test, amazon_y_train, amazon_y_test = train_test_split(amazon_X, amazon_y, test_size = 0.3, random_state = 80)
# split imbd data
imdb_x_train, imdb_x_test, imdb_y_train, imdb_y_test = train_test_split(imdb_X, imdb_y, test_size = 0.3, random_state = 45)

In [78]:
# vectorizing yelp data

vectorize_yelp = CountVectorizer(min_df = 0, lowercase=False)
vectorize_yelp.fit(yelp_x_train)

yelp_X_train = vectorize_yelp.transform(yelp_x_train)
yelp_X_test = vectorize_yelp.transform(yelp_x_test)


# vectorizing amazon data

vectorize_amazon = CountVectorizer(min_df = 0, lowercase=False)
vectorize_amazon.fit(amazon_x_train)

amazon_X_train = vectorize_amazon.transform(amazon_x_train)
amazon_X_test = vectorize_amazon.transform(amazon_x_test)


# vectorizing imbd data

vectorize_imdb = CountVectorizer(min_df = 0, lowercase=False)
vectorize_imdb.fit(imdb_x_train)

imdb_X_train = vectorize_imdb.transform(imdb_x_train)
imdb_X_test = vectorize_imdb.transform(imdb_x_test)

In [79]:
# yelp classification model
yelp_classifier = LogisticRegression()
yelp_classifier.fit(yelp_X_train, yelp_y_train)
yelp_score = yelp_classifier.score (yelp_X_test, yelp_y_test)

# amazon classification model
amazon_classifier = LogisticRegression()
amazon_classifier.fit(amazon_X_train, amazon_y_train)
amazon_score = amazon_classifier.score (amazon_X_test, amazon_y_test)

# imbd classification model
imdb_classifier = LogisticRegression()
imdb_classifier.fit(imdb_X_train, imdb_y_train)
imdb_score = imdb_classifier.score (imdb_X_test, imdb_y_test)

print(f"""
Yelp Score: {yelp_score}
Amazon Score: {amazon_score}
IMDB Score: {imdb_score}
""")


Yelp Score: 0.73
Amazon Score: 0.81
IMDB Score: 0.72



In [80]:
yelp_test_sentence_1 = ["Cold food", "Slow service"]

prediction_1 = vectorize_yelp.transform(yelp_test_sentence_1)
yelp_classifier.predict(prediction_1)

array([0, 0])

In [81]:
yelp_test_sentence_2 = ["Great meal", "Properly cooked meat"]

prediction_2 = vectorize_yelp.transform(yelp_test_sentence_2)

yelp_classifier.predict(prediction_2)

array([1, 1])