In [79]:
import json
import re

import contractions
import gensim.downloader as api
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

In [36]:
# Input locations, relative to the notebook's working directory.
# NOTE(review): "categoty_file" is a typo of "category_file"; the name is kept
# because the loading cell below reads it under this spelling.
categoty_file = './data/US_category_id.json'
video_file = './data/USvideos.csv'

In [37]:
# Load the category metadata (JSON) and the video records (CSV) into raw frames.
raw_cat_df = pd.read_json(categoty_file)
# on_bad_lines='skip' drops malformed CSV rows instead of raising, so the
# number of loaded rows can be smaller than the number of lines in the file.
raw_vid_df = pd.read_csv(video_file, on_bad_lines='skip')

In [38]:
# Round-trip the category frame through JSON to obtain plain Python records,
# then flatten the nested dicts into dotted column names.
json_struct = json.loads(raw_cat_df.to_json(orient="records"))    
category_flat = pd.json_normalize(json_struct)  # nested keys become 'items.snippet.title', etc.
# Keep only the category id and its human-readable title.
categories = category_flat[['items.id', 'items.snippet.title']]

In [42]:
# Build the modelling frame from the raw videos: one text feature and one
# integer label per row. Rows without a title or a category are discarded.
df = raw_vid_df[['title', 'category_id']].dropna().copy()
df['category_id'] = df['category_id'].astype(int)

# Enrich the title with the channel name and the tags. Missing channel/tag
# values are treated as empty strings: string concatenation with NaN yields
# NaN, which would make the later cleaning step fail on a non-string value.
extra_text = raw_vid_df.loc[df.index, ['channel_title', 'tags']].fillna('')
df['title'] = df['title'] + ' ' + extra_text['channel_title'] + ' ' + extra_text['tags']

In [52]:
# clean data
def clean_data(corpus, column='title'):
    """Normalise the text stored in ``corpus[column]`` in place.

    Each value is processed in this order:

    1. lower-cased and split on whitespace,
    2. every token passed through ``contractions.fix`` and re-joined with
       single spaces,
    3. every character that is not an ASCII letter replaced by a space,
    4. runs of spaces collapsed to one and the ends trimmed.

    Trimming is done last because step 3 can introduce leading or trailing
    spaces (e.g. a title that ends with punctuation).

    Parameters
    ----------
    corpus : pandas.DataFrame
        Frame holding the text; the selected column is overwritten.
    column : str, default 'title'
        Name of the text column to clean.

    Returns
    -------
    None
        The frame is modified in place.
    """
    def _normalise(text):
        # expand contractions token by token on the lower-cased text
        text = ' '.join(contractions.fix(word) for word in text.lower().split())
        # remove non alphabet characters
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        # remove extra space in between words and at both ends
        return re.sub(' +', ' ', text).strip()

    # single pass over the column instead of one pass per cleaning step
    corpus[column] = corpus[column].apply(_normalise)


In [61]:
# split every text of the collection into its word tokens
def tokenize_string(x):
    """Tokenize each string in ``x`` with NLTK's ``word_tokenize``.

    Returns a list holding one token list per element of ``x``, in the
    order in which ``x`` is iterated.
    """
    tokenized = []
    for text in x:
        tokenized.append(word_tokenize(text))
    return tokenized

In [62]:
# collect the pretrained embedding of every distinct dataset word that the model knows
def exist_in_google(X, vectors=None):
    """Tokenize ``X`` and look up each distinct token in a word-vector model.

    Parameters
    ----------
    X : iterable of str
        Texts to tokenize (passed to ``tokenize_string``).
    vectors : mapping-like, optional
        Object supporting ``vectors[word]`` and raising ``KeyError`` for
        unknown words. When omitted, the notebook-level ``wv`` model is used,
        which keeps existing one-argument calls working.

    Returns
    -------
    res : dict
        Maps each token found in the model to its vector.
    reviews : list of list of str
        The tokenized texts, in the same order as ``X``.
    """
    if vectors is None:
        vectors = wv  # fall back to the model loaded at notebook level

    reviews = tokenize_string(X)
    res = {}
    missing = set()  # tokens already known to be absent from the model

    for review in reviews:
        for word in review:
            # each distinct token is looked up at most once, hit or miss
            if word in res or word in missing:
                continue
            try:
                res[word] = vectors[word]
            except KeyError:
                missing.add(word)
    return res, reviews

In [70]:
# Average the pretrained word vectors of each tokenized text into one feature vector
def vectorize(X, y, exist_dict, tokens):
    """Build one averaged embedding per text and drop labels of empty texts.

    Parameters
    ----------
    X : pandas.Series
        Only its index is used; it is paired positionally with ``tokens``.
    y : pandas.Series
        Labels sharing the index of ``X``.
    exist_dict : dict
        Maps a token to its vector (NumPy array or plain sequence of numbers).
    tokens : list of list of str
        Token list for each entry of ``X``, in the same order.

    Returns
    -------
    X_new : list of numpy.ndarray
        Mean (float64) of the vectors of the known tokens, one entry per text
        that contains at least one known token.
    y_new : pandas.Series
        ``y`` without the rows whose text had no known token, so that it
        lines up with ``X_new``.
    """
    X_new = []
    index_to_remove = []

    for idx, review in zip(X.index, tokens):
        known = [exist_dict[word] for word in review if word in exist_dict]
        if known:
            # Explicit float64 array arithmetic: does not depend on the
            # vectors being NumPy arrays (lists would otherwise concatenate).
            stacked = np.asarray(known, dtype=np.float64)
            X_new.append(np.sum(stacked, axis=0) / len(known))
        else:
            index_to_remove.append(idx)

    # drop the labels of texts that produced no feature vector
    y_new = y.drop(labels=index_to_remove)

    return X_new, y_new

In [64]:
# Normalise the text column of df; clean_data overwrites it in place and has no return value.
clean_data(df)

In [65]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader.
# NOTE(review): presumably downloaded on first use and cached locally afterwards — confirm before a fresh run.
wv = api.load('word2vec-google-news-300')

In [66]:
# Features are the cleaned text column; labels are the integer category ids.
X = df['title']
y = df['category_id']

In [67]:
# Collect the vector of every dataset word that exists in the pretrained model,
# together with the tokenized texts.
exist_dict, tokens = exist_in_google(df['title'])

In [71]:
# One averaged embedding per text; labels of texts without any known word are dropped.
X_g, y_g = vectorize(X, y, exist_dict, tokens)

In [76]:
# Hold out 20% of the samples for evaluation; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_g, y_g, test_size=0.2, random_state=42)

In [80]:
# Fit a linear SVM on the training embeddings and predict the held-out split.
svm = LinearSVC(dual=False)  # dual=False solves the primal optimisation problem
svm.fit(X_train, y_train)
svm_prediction = svm.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, svm_prediction))

              precision    recall  f1-score   support

           1       0.73      0.60      0.66       463
           2       0.89      0.83      0.86        66
          10       0.86      0.92      0.89      1290
          15       0.84      0.97      0.90       172
          17       0.89      0.95      0.92       420
          19       0.91      0.88      0.90        83
          20       0.97      0.89      0.93       170
          22       0.61      0.41      0.49       618
          23       0.69      0.65      0.67       682
          24       0.70      0.72      0.71      2006
          25       0.87      0.88      0.87       510
          26       0.75      0.84      0.79       868
          27       0.79      0.82      0.81       336
          28       0.73      0.77      0.75       484
          29       1.00      0.73      0.84        11
          43       1.00      0.80      0.89        10

    accuracy                           0.77      8189
   macro avg       0.83   