In [87]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from json import load
from gensim.models import Word2Vec
import gensim.downloader as api
from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

## Read CSV and JSON

In [53]:
def readCSV(dir):
    df = pd.read_csv(dir, error_bad_lines = False)
    return df

csvDF = readCSV("./USvideos.csv")

# takes in the YouTube categories JSON file and returns a dictionary
# where each key is a category ID and each value is a category
def readJSON(dir):
    categoryDictionary = {}
    
    f = open(dir)
    jsonDictionary = load(f)
    # iterate through all the categories in the JSON dictionary
    for category in jsonDictionary["items"]:
        currentID = int(category["id"])
        currentCategory = category["snippet"]["title"]
        categoryDictionary[currentID] = currentCategory
    f.close()
    return categoryDictionary

csvDF = readCSV("./USvideos.csv")
jsonCategories = readJSON("./US_category_id.json")
jsonCategories



  csvDF = readCSV("./USvideos.csv")
b'Skipping line 2401: expected 11 fields, saw 21\nSkipping line 2800: expected 11 fields, saw 21\nSkipping line 5297: expected 11 fields, saw 12\nSkipping line 5299: expected 11 fields, saw 12\nSkipping line 5300: expected 11 fields, saw 12\nSkipping line 5301: expected 11 fields, saw 12\n'


  csvDF = readCSV("./USvideos.csv")
b'Skipping line 2401: expected 11 fields, saw 21\nSkipping line 2800: expected 11 fields, saw 21\nSkipping line 5297: expected 11 fields, saw 12\nSkipping line 5299: expected 11 fields, saw 12\nSkipping line 5300: expected 11 fields, saw 12\nSkipping line 5301: expected 11 fields, saw 12\n'


{1: 'Film & Animation',
 2: 'Autos & Vehicles',
 10: 'Music',
 15: 'Pets & Animals',
 17: 'Sports',
 18: 'Short Movies',
 19: 'Travel & Events',
 20: 'Gaming',
 21: 'Videoblogging',
 22: 'People & Blogs',
 23: 'Comedy',
 24: 'Entertainment',
 25: 'News & Politics',
 26: 'Howto & Style',
 27: 'Education',
 28: 'Science & Technology',
 29: 'Nonprofits & Activism',
 30: 'Movies',
 31: 'Anime/Animation',
 32: 'Action/Adventure',
 33: 'Classics',
 34: 'Comedy',
 35: 'Documentary',
 36: 'Drama',
 37: 'Family',
 38: 'Foreign',
 39: 'Horror',
 40: 'Sci-Fi/Fantasy',
 41: 'Thriller',
 42: 'Shorts',
 43: 'Shows',
 44: 'Trailers'}

In [79]:
def categorizeDF(csvDF, jsonCategories):
    csvDF["category"] = csvDF["category_id"].map(lambda x: jsonCategories[x])
    csvDF["tagsCleaned"] = csvDF["tags"].map(lambda x: x.replace("|", " "))
    csvDF["tagsCleaned"] = csvDF["tagsCleaned"].map(lambda x: x.replace("[none]", ""))
    csvDF["titlechanneltags"] = csvDF["title"] + " " + csvDF["channel_title"] + " " + csvDF["tagsCleaned"]
    filteredDF = csvDF.filter(["titlechanneltags"], axis = 1)
    return filteredDF
    
filteredDF = categorizeDF(csvDF, jsonCategories)
classDF = csvDF["category_id"]
classDF

0       24
1       28
2       22
3       28
4       23
        ..
7987    27
7988    25
7989    10
7990    24
7991    28
Name: category_id, Length: 7992, dtype: int64

## Split Testing and Training Data

In [80]:
# takes in the reduced dataframe and the class dataframe and splits it for test and train split
def splitDF(dfX, dfY):
    return train_test_split(dfX, dfY, train_size = 0.8)

trainX, testX, trainY, testY = splitDF(filteredDF, classDF)

## Load Gensim's Pre-Trained Model: word2vec-google-news-300

In [67]:
wv = api.load('word2vec-google-news-300')

## Vectorize the review bodies

In [82]:
# helper function with takes as input an array of word tokens and the word vectors 
# dictionary, and outputs a word vector for that entire review
def reviewVectorizer(review_body_arr, wordVectors):
    total, wordCount = 0, 0
    for word in review_body_arr:
        try:
            total += wordVectors[word]
            wordCount += 1
        except: continue
    if wordCount == 0: return [0] * 300
    featureValue = total / len(review_body_arr)
    return featureValue

# takes in as input the training or test data, the word vectors dictionary,
# a parameter for whether or not to tokenize the data, and an optional filename
# in case outputting to memory for preprocessing purposes and
# returns a dataframe with the vectors corresponding to each review
def word2VecCSV(dataFile, wordVector):
    dataFile["titlechanneltags_tokens"] = dataFile["titlechanneltags"].apply(lambda x: x.split(" "))
    dataFile["titlechanneltagsWordVector"] = dataFile["titlechanneltags_tokens"].apply(lambda x: reviewVectorizer(x, wordVector))
    colDF = pd.DataFrame(dataFile["titlechanneltagsWordVector"].to_list())
    return colDF

perceptronTrainX = word2VecCSV(trainX, wv)
perceptronTestX  = word2VecCSV(testX, wv)

## Perceptron Model

In [84]:
# takes as input the review data, and performs a perceptron
# model analysis and prediction on it, returning the results
# dataframe
def perceptron(trainX, trainY, testX, testY):
    model = Perceptron(tol = 0.001, random_state = 0)
    results = model.fit(trainX, trainY)
    predY = results.predict(testX)
    resultDF = pd.DataFrame(data = predY, columns = ["Prediction"])
    testY = testY.reset_index(drop = True)
    resultDF = pd.concat([resultDF, testY], axis = 1)
    return resultDF

perceptronResults = perceptron(perceptronTrainX, trainY, perceptronTestX, testY)

In [88]:
# takes in as input a results dataframe from a model, as well 
# as a string for the model name, and then outputs 
# metrics relating to that model's results
def metricPrinter(df, modelString):
    print("Model:", modelString)
    resultPredictions = df["Prediction"]
    resultActuals = df["category_id"]
    precision = precision_score(resultActuals, resultPredictions, average = None)
    recall = recall_score(resultActuals, resultPredictions, average = None)
    f1 = f1_score(resultActuals, resultPredictions, average = None)
    accuracy = accuracy_score(resultActuals, resultPredictions)
    print("Accuracy:", accuracy)
    
metricPrinter(perceptronResults, "Perceptron")

Model: Perceptron
Accuracy: 0.7773608505315822


  _warn_prf(average, modifier, msg_start, len(result))
