In [1]:
import pandas as pd
import numpy as np
from IPython.display import Image
from tqdm import tqdm_notebook as tqdm
import json
import matplotlib.pyplot as plt
import csv
# adjust dimensions of plot area to make it look better
plt.rcParams['figure.figsize'] = (15, 7)
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.preprocessing import Normalizer

In [2]:
def extract_data(filepath):
    """Load a labelled review file into a list of ``[text, label]`` pairs.

    Every non-blank line is stripped, has its apostrophes removed and is
    lower-cased. The last character of the cleaned line is parsed as the
    integer label; everything before it (stripped again) is the review text.

    Parameters
    ----------
    filepath : str
        Path of the text file to read, one example per line.

    Returns
    -------
    list
        One ``[review_text, label]`` entry per non-blank line, in file order.

    Raises
    ------
    ValueError
        If the last character of a non-blank line cannot be parsed as an int.
    """
    dataset = []
    # The context manager closes the handle even when a line fails to parse.
    with open(filepath) as f:
        for line in f:
            edited_line = line.strip().replace('\'', '').lower()
            if not edited_line:
                # Blank lines (e.g. an extra newline at EOF) carry no example
                # and would otherwise crash on edited_line[-1].
                continue
            dataset.append([edited_line[:-1].strip(), int(edited_line[-1])])

    return dataset

In [3]:
"""
Extracting training datasets for Yelp and IMDB
"""

'\nExtracting training datasets for Yelp and IMDB\n'

In [4]:
# Yelp: train / valid / test splits as lists of [review, label]
yelp_train = extract_data('./hwk3_datasets/yelp-train.txt')
yelp_valid = extract_data('./hwk3_datasets/yelp-valid.txt')
yelp_test = extract_data('./hwk3_datasets/yelp-test.txt')

# IMDB: train / valid / test splits as lists of [review, label]
imdb_train = extract_data('./hwk3_datasets/IMDB-train.txt')
imdb_valid = extract_data('./hwk3_datasets/IMDB-valid.txt')
imdb_test = extract_data('./hwk3_datasets/IMDB-test.txt')

In [5]:
"""
Making dataframes for the train, valid and test sets of Yelp and IMDB
"""

'\nMaking dataframes for the train, valid and test sets of Yelp and IMDB\n'

In [6]:
# Column labels: each row pairs a review text with its integer label.
yelp_columns = ['review', 'rating']
imdb_columns = ['review', 'sentiment']

# Yelp frames
yelp_train_df = pd.DataFrame(yelp_train, columns=yelp_columns)
yelp_valid_df = pd.DataFrame(yelp_valid, columns=yelp_columns)
yelp_test_df = pd.DataFrame(yelp_test, columns=yelp_columns)

# IMDB frames
imdb_train_df = pd.DataFrame(imdb_train, columns=imdb_columns)
imdb_valid_df = pd.DataFrame(imdb_valid, columns=imdb_columns)
imdb_test_df = pd.DataFrame(imdb_test, columns=imdb_columns)

In [7]:
"""
Storing the input (reviews) for the
train, valid and test sets of Yelp and IMDB
"""

'\nStoring the input (reviews) for the\ntrain, valid and test sets of Yelp and IMDB\n'

In [8]:
# Review texts as plain Python lists, one list per split.
yelp_train_input = yelp_train_df['review'].tolist()
yelp_valid_input = yelp_valid_df['review'].tolist()
yelp_test_input = yelp_test_df['review'].tolist()

imdb_train_input = imdb_train_df['review'].tolist()
imdb_valid_input = imdb_valid_df['review'].tolist()
imdb_test_input = imdb_test_df['review'].tolist()

In [9]:
"""
Storing the rating (Yelp) and sentiment (IMDB)
for the train, valid and test sets of Yelp and IMDB
"""

'\nStoring the rating (Yelp) and sentiment (IMDB)\nfor the train, valid and test sets of Yelp and IMDB\n'

In [10]:
# Labels as plain Python lists: 'rating' for Yelp, 'sentiment' for IMDB.
yelp_train_output = yelp_train_df['rating'].tolist()
yelp_valid_output = yelp_valid_df['rating'].tolist()
yelp_test_output = yelp_test_df['rating'].tolist()

imdb_train_output = imdb_train_df['sentiment'].tolist()
imdb_valid_output = imdb_valid_df['sentiment'].tolist()
imdb_test_output = imdb_test_df['sentiment'].tolist()

In [11]:
"""
Using a CountVectorizer will turn the
words into lowercase and remove the punctuations
"""

'\nUsing a CountVectorizer will turn the\nwords into lowercase and remove the punctuations\n'

In [12]:
# One bag-of-words vectorizer per corpus, each fitted on its own training reviews.
yelp_vectorizer, imdb_vectorizer = CountVectorizer(), CountVectorizer()

yelp_vectors_train = yelp_vectorizer.fit_transform(yelp_train_input)
imdb_vectors_train = imdb_vectorizer.fit_transform(imdb_train_input)

In [21]:
# Inspect the fitted Yelp vocabulary: a dict mapping each token to an integer id.
# This displays the whole mapping, so the output is very long.
yelp_vectorizer.vocabulary_

{'surface': 22148,
 'multiplied': 14895,
 'dishwashers': 6869,
 'patron': 16459,
 'kanyeoprah': 12425,
 'retired': 18914,
 'realtor': 18295,
 'whyd': 24960,
 'blooms': 2807,
 'unfiltered': 23877,
 'impressive': 11473,
 'toothpicks': 23205,
 'byob': 3595,
 'ragu': 18121,
 'dollop': 7058,
 'represent': 18755,
 'entry': 7903,
 'varying': 24238,
 'carrots': 3905,
 'pooches': 17251,
 'act': 700,
 'anywhwere': 1363,
 'agents': 896,
 'abs': 567,
 'cote': 5545,
 '7th': 458,
 'moreover': 14752,
 'russian': 19375,
 'tibs': 22985,
 'dixon': 6987,
 'availablecons': 1866,
 'ow': 16096,
 'velociraptor': 24280,
 'pocket': 17186,
 'molten': 14666,
 'contracts': 5356,
 'recipes': 18356,
 'created': 5732,
 'wolverine': 25128,
 'montgomery': 14723,
 'substitute': 21954,
 'claiming': 4625,
 'freshness': 9299,
 'costume': 5540,
 'unit': 23928,
 'careless': 3852,
 'ck': 4619,
 'showing': 20389,
 'quiet': 18037,
 'cross': 5822,
 'anticipate': 1324,
 'spins': 21306,
 'sodas': 20955,
 'cobb': 4829,
 'md': 1408

In [None]:
"""
Get the frequency of the
words in a vocabulary.

Return: A dictionary
which has the structure - 
    key = word
    value = [id, count]
"""

In [50]:
def get_vocab_frequencies(dataset_input, dataset_vectorizer, tokenizer=None):
    """Count how often each vocabulary word occurs in a dataset.

    Parameters
    ----------
    dataset_input : iterable of str
        The text examples to scan.
    dataset_vectorizer : object
        A fitted vectorizer exposing a ``vocabulary_`` mapping of word -> id.
    tokenizer : callable, optional
        Function that turns one example into an iterable of tokens. Defaults
        to ``str.split`` (whitespace splitting). With the default, a word that
        has punctuation attached (e.g. ``"food."``) is a different token and
        is not counted; pass the vectorizer's own analyzer, e.g.
        ``dataset_vectorizer.build_analyzer()``, to count with exactly the
        tokenization that built the vocabulary.

    Returns
    -------
    dict
        ``{word: [id, count]}`` for every vocabulary word seen at least once,
        with keys in first-seen order.
    """
    if tokenizer is None:
        tokenizer = str.split

    # Look the mapping up once instead of on every token.
    vocabulary = dataset_vectorizer.vocabulary_

    vectorizer_with_count = {}
    for example in dataset_input:
        for word in tokenizer(example):
            if word not in vocabulary:
                continue
            entry = vectorizer_with_count.get(word)
            if entry is None:
                vectorizer_with_count[word] = [vocabulary[word], 1]
            else:
                entry[1] += 1

    return vectorizer_with_count

In [60]:
# Per-corpus {word: [id, count]} tables computed from the training reviews.
yelp_train_frequencies = get_vocab_frequencies(
    dataset_input=yelp_train_input,
    dataset_vectorizer=yelp_vectorizer,
)
imdb_train_frequencies = get_vocab_frequencies(
    dataset_input=imdb_train_input,
    dataset_vectorizer=imdb_vectorizer,
)

In [None]:
"""
Sorting the words in the dictionary
in descending order of frequencies.
Getting the top 10,000 words (words
with the highest frequencies). These
words form the feature set.
"""

In [62]:
# Rank (word, [id, count]) pairs by count, highest first, and keep the top 10,000.
yelp_feature_set = list(yelp_train_frequencies.items())
yelp_feature_set.sort(key=lambda pair: pair[1][1], reverse=True)
yelp_feature_set = yelp_feature_set[:10000]

imdb_feature_set = list(imdb_train_frequencies.items())
imdb_feature_set.sort(key=lambda pair: pair[1][1], reverse=True)
imdb_feature_set = imdb_feature_set[:10000]

In [66]:
# Peek at the ten most frequent Yelp feature words as (word, [id, count]) pairs.
yelp_feature_set[:10]

[('the', [22800, 45102]),
 ('and', [1229, 30082]),
 ('to', [23109, 20847]),
 ('of', [15606, 14335]),
 ('was', [24674, 13551]),
 ('is', [12005, 11825]),
 ('for', [9102, 10395]),
 ('it', [12025, 9881]),
 ('in', [11489, 9822]),
 ('that', [22791, 7793])]

In [None]:
"""
Making a new vectorizer for Yelp and IMDB.
The new vectorizer is based on the 10,000
words with the highest frequencies.

Transform the valid and test input according
to the vectorizer just found. Convert the result
into an array. Modify the array such that the
value along a dimension gets set to 1 if its
value is greater than zero else it gets set to 0.
"""

In [95]:
# Feature words (with their [id, count]) laid out as a two-column frame.
yelp_temp_df = pd.DataFrame(data=yelp_feature_set, columns=["word", "id_and_count"])

# Fit a fresh vectorizer on the feature words only, so its vocabulary is
# restricted to them.
# NOTE(review): because it is fitted on the word list, yelp_vectors_train_final
# has one row per feature word, not one row per training review. Confirm this
# is what downstream code expects.
yelp_vectorizer_final = CountVectorizer()
yelp_vectors_train_final = yelp_vectorizer_final.fit_transform(yelp_temp_df["word"].tolist())

# Validation split: dense counts, then binarised to 0/1 presence indicators.
yelp_valid_temp1 = yelp_vectorizer_final.transform(yelp_valid_input).toarray()
yelp_vectors_valid = np.greater(yelp_valid_temp1, 0).astype(int)

# Test split: same treatment.
yelp_test_temp1 = yelp_vectorizer_final.transform(yelp_test_input).toarray()
yelp_vectors_test = np.greater(yelp_test_temp1, 0).astype(int)

In [96]:
# Feature words (with their [id, count]) laid out as a two-column frame.
imdb_temp_df = pd.DataFrame(data=imdb_feature_set, columns=["word", "id_and_count"])

# Fit a fresh vectorizer on the feature words only, so its vocabulary is
# restricted to them.
# NOTE(review): because it is fitted on the word list, imdb_vectors_train_final
# has one row per feature word, not one row per training review. Confirm this
# is what downstream code expects.
imdb_vectorizer_final = CountVectorizer()
imdb_vectors_train_final = imdb_vectorizer_final.fit_transform(imdb_temp_df["word"].tolist())

# Validation split: dense counts, then binarised to 0/1 presence indicators.
imdb_valid_temp1 = imdb_vectorizer_final.transform(imdb_valid_input).toarray()
imdb_vectors_valid = np.greater(imdb_valid_temp1, 0).astype(int)

# Test split: same treatment.
imdb_test_temp1 = imdb_vectorizer_final.transform(imdb_test_input).toarray()
imdb_vectors_test = np.greater(imdb_test_temp1, 0).astype(int)

In [81]:
# yelp_temp_df = pd.DataFrame(data=yelp_feature_set, columns=["word", "id_and_count"])
# yelp_temp_df["id_and_count"][0]

[22800, 45102]

In [102]:
# Dump the binarised Yelp validation matrix as text: one example per line,
# every value followed by ', ' (including the last one on the line).
with open('yelp_vectors_valid.txt', 'w') as out_file:
    for row in yelp_vectors_valid:
        out_file.write(''.join(str(value) + ', ' for value in row) + '\n')

In [110]:
# Report the shape of the matrix produced by fitting the final IMDB vectorizer
# on its feature-word list.
print(imdb_vectors_train_final.shape)

(10000, 10000)


In [93]:
# Sanity check: densify the matrix and confirm the type of a 10-row slice.
type(yelp_vectors_train_final.toarray()[:10])

numpy.ndarray

In [111]:
# Sanity check: confirm the container type of the binarised validation matrix.
type(yelp_vectors_valid)

numpy.ndarray