In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as extraction
from langdetect import detect
import time
from google.cloud import translate_v2 as translate
import nltk
import nltk.stem as stem
from nltk import word_tokenize as tokenize
import re
import string
import os
# Tell the Google Cloud client where the service-account key lives.
# setdefault keeps a value that is already configured in the environment, so the
# notebook is not tied to this one machine-specific path.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", '/Users/hamilton/Desktop/Stuff/GoogleAPIKey/MovieTranslation-f8371579c36e.json')

Preprocessing:
- removed all rows whose feature vector is irregular (longer than the expected 44 columns)
- removed all commas in the CSV so that they don't interfere with `read_csv`
- dropped rows with missing values
- deleted non-movies
- threw out rows whose title has multiple parentheses appearances
- translated all titles to English
- removed the year and other parentheses from the title
- chose to use the Snowball (Porter2) stemmer
- removed punctuation
- parsed titles and appended TF-IDF features

Title Processing Steps:
- remove the year in the title
- remove punctuation
- translate to English
- remove apostrophes, including any added back by the translation
- stem the title

Visualization?:
- distribution of languages across movies (maybe keep track of performance per movie-title language)


# Loading  Data

In [2]:
def load_data(path="./imdbmovies/imdb.csv"):
    """Load the IMDb CSV, clean it, normalise the titles and split features from labels.

    Row filtering (all evaluated on the raw frame):
      * rows with a value in ``Unnamed: 44`` are dropped (irregular rows that
        spill past the expected columns),
      * rows whose ``type`` is not ``'video.movie'`` are dropped,
      * rows whose title contains two opening parentheses are dropped,
      * after removing the ``Unnamed: 44``..``Unnamed: 47`` columns, any row
        that still has a missing value is dropped.

    Title normalisation (per title): strip a trailing ``(... YYYY)`` group,
    remove punctuation, translate through the Google Translate client
    (default target language), decode HTML entities and drop apostrophes,
    then stem every token with the Snowball English stemmer.

    NOTE: one translation request is issued per title, so this is slow and
    needs network access plus valid Google credentials.

    Parameters
    ----------
    path : str, optional
        Location of the raw CSV. Defaults to the path the notebook always used.

    Returns
    -------
    X : pandas.DataFrame
        The first 11 remaining columns (title plus numeric features).
    y : pandas.DataFrame
        Every column after the 11th (presumably the genre indicators - confirm
        against the CSV header).
    """
    import html  # local import: only needed to decode the translation response

    client = translate.Client()
    stemmer = stem.snowball.EnglishStemmer()
    strip_punctuation = str.maketrans('', '', string.punctuation)
    year_in_parens = re.compile(r'\(([ a-zA-Z]*)([0-9]{4})\)')

    def normalize_title(title):
        # "(Some Text 1999)" -> "Some Text "; the year itself is discarded.
        title = year_in_parens.sub(r'\1', title)
        title = title.translate(strip_punctuation)
        translated = client.translate(title)["translatedText"]
        # The response can carry HTML entities (the original code special-cased
        # "&#39;"). Decode all of them so none leak in as tokens, then drop
        # apostrophes exactly as before.
        translated = html.unescape(translated).replace("'", "")
        return " ".join(stemmer.stem(word) for word in tokenize(translated))

    data = pd.read_csv(path)

    # .notna() matches logical_not(isnan(...)) for float columns and also works
    # when pandas parsed the column as object dtype.
    irregular = data['Unnamed: 44'].notna()
    not_movie = data["type"] != 'video.movie'
    # na=False: a missing title must not poison the boolean mask; such rows are
    # removed by dropna() below anyway.
    two_parens = data["title"].str.contains(r"\(.*\(", na=False)

    dataCleaned = (
        data[~(irregular | not_movie | two_parens)]
        .drop(columns=["Unnamed: 44", "Unnamed: 45", "Unnamed: 46", "Unnamed: 47"])
        .dropna(axis=0)
        .reset_index(drop=True)
        .drop(columns=["url", "tid", "fn", "wordsInTitle", "type"])
    )
    dataCleaned["title"] = dataCleaned["title"].apply(normalize_title)

    # Column split is positional: first 11 columns are features, the rest labels.
    X = dataCleaned.iloc[:, :11]
    y = dataCleaned.iloc[:, 11:]

    return X, y

In [13]:
# Time the whole load / clean / translate pipeline with a monotonic clock.
start = time.perf_counter()
origX, y = load_data()
print("Time to load data: {} seconds".format(time.perf_counter() - start))

# Later cells read and mutate `X`; give them a working copy so they run on a
# fresh kernel and `origX` keeps the untouched features.
X = origX.copy()

Time to load data: 0.3057441711425781 seconds


In [4]:
# List the column names of the label frame, then of the feature frame.
label_names = y.columns.to_list()
print("Labels ", label_names)
feature_names = X.columns.to_list()
print("Features ", feature_names)

NameError: name 'y' is not defined

In [None]:
# Disabled export of the processed frames. When enabled it writes the CSV files
# that the following cells read back with pd.read_csv.
# X.to_csv("./imdbmovies/features.csv", index=False)
# y.to_csv("./imdbmovies/labels.csv", index=False)

In [16]:
# Read the label matrix back from the saved CSV (rebinds `y`) and show it.
y = pd.read_csv(filepath_or_buffer="./imdbmovies/labels.csv")
display(y)

Unnamed: 0,Action,Adult,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
0,0,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,1,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10092,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
10093,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
10094,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
10095,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Read the saved feature table (title still a single text column) and show it.
shortX = pd.read_csv(filepath_or_buffer="./imdbmovies/features.csv")
display(shortX)

Unnamed: 0,title,imdbRating,ratingCount,duration,year,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,nrOfGenre
0,the vagabond and the child,8.4,40550,3240,1921,1,0,19,96,85,3
1,gold rush,8.3,45319,5700,1925,2,1,35,110,122,3
2,metropoli,8.4,81007,9180,1927,3,4,67,428,376,2
3,the general,8.3,37521,6420,1926,1,1,53,123,219,3
4,citi light,8.7,70057,5220,1931,2,0,38,187,186,3
...,...,...,...,...,...,...,...,...,...,...,...
10092,scaramouch,7.3,321,7440,1923,0,0,2,4,14,2
10093,you cant win,6.0,93,480,1948,0,0,0,0,4,2
10094,runway to luck,6.7,394,6000,1948,0,0,3,0,10,2
10095,the adventur of ichabod and taddäus toad,7.2,6340,4080,1949,0,0,8,20,37,3


In [15]:
# Read the saved feature table that already has one column per title token.
fullX = pd.read_csv(filepath_or_buffer="./imdbmovies/features_vectorized.csv")
display(fullX)

Unnamed: 0,imdbRating,ratingCount,duration,year,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,nrOfGenre,...,zpg,zu,zubeidaa,zuckermann,zui,zulu,zuo,zyklopen,åmål,æon
0,8.4,40550,3240,0.0,1,0,19,96,85,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8.3,45319,5700,0.0,2,1,35,110,122,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8.4,81007,9180,0.0,3,4,67,428,376,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8.3,37521,6420,0.0,1,1,53,123,219,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.7,70057,5220,0.0,2,0,38,187,186,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10092,7.3,321,7440,0.0,0,0,2,4,14,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10093,6.0,93,480,0.0,0,0,0,0,4,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10094,6.7,394,6000,0.0,0,0,3,0,10,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10095,7.2,6340,4080,0.0,0,0,8,20,37,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Text Vectorization

In [6]:
# Two text vectorizers with default settings: TF-IDF and feature hashing.
tfidfVector, hashVector = extraction.TfidfVectorizer(), extraction.HashingVectorizer()

In [7]:
# Fit TF-IDF on the titles; `title` is removed from X in the process.
tfidf = tfidfVector.fit_transform(X.pop('title'))

# Tokens in column order, taken from the fitted vocabulary (term -> column index).
# This works on every scikit-learn version; get_feature_names() was removed in 1.2.
tokens = sorted(tfidfVector.vocabulary_, key=tfidfVector.vocabulary_.get)

# A token can be spelled exactly like an existing feature (e.g. "year"). Assigning
# X[token] would silently overwrite that feature with TF-IDF weights, so colliding
# tokens get a "_tfidf" suffix; all other tokens keep their plain name.
existing_columns = set(X.columns)
tfidf_columns = [tok + "_tfidf" if tok in existing_columns else tok for tok in tokens]

# Build every token column in one shot as a sparse frame (fill value 0) and
# concatenate once, instead of inserting thousands of columns one at a time.
tfidf_frame = pd.DataFrame.sparse.from_spmatrix(tfidf, index=X.index, columns=tfidf_columns)
X = pd.concat([X, tfidf_frame], axis=1)

KeyboardInterrupt: 

In [53]:
# Full vocabulary in column order. get_feature_names() was removed in
# scikit-learn 1.2, so read the same ordered list from the fitted
# vocabulary_ mapping (term -> column index).
sorted(tfidfVector.vocabulary_, key=tfidfVector.vocabulary_.get)

['00',
 '000',
 '007',
 '01',
 '10',
 '100',
 '1000',
 '100000',
 '1001',
 '101',
 '102',
 '105',
 '109',
 '10speed',
 '10th',
 '111111',
 '1114',
 '1138',
 '117',
 '12',
 '120',
 '127',
 '13',
 '13th',
 '1408',
 '1453',
 '1492',
 '15',
 '16',
 '17',
 '174',
 '1776',
 '18',
 '180',
 '1812',
 '18th',
 '19',
 '19011980',
 '1911',
 '1931',
 '1935',
 '1937',
 '1938',
 '19391953',
 '1941',
 '1942',
 '1944',
 '1950',
 '1962',
 '1974',
 '1980',
 '1983',
 '1987',
 '19892004',
 '1991',
 '19982002',
 '1999',
 '19th',
 '20',
 '200',
 '2000',
 '20000',
 '2001',
 '2002',
 '2004',
 '2006',
 '2009',
 '2011',
 '2012',
 '2019',
 '2022',
 '2046',
 '2077',
 '20th',
 '21',
 '211',
 '21st',
 '22',
 '23',
 '237',
 '23rd',
 '24',
 '25',
 '25th',
 '27',
 '27th',
 '28',
 '2head',
 '2nd',
 '30',
 '300',
 '3000',
 '33',
 '35',
 '36',
 '39',
 '3d',
 '40',
 '42',
 '420',
 '43',
 '45',
 '451',
 '46',
 '47',
 '48',
 '49th',
 '4ever',
 '50',
 '51',
 '51st',
 '52',
 '54',
 '5555',
 '56th',
 '57',
 '5ecret',
 '5tar',
 