In [127]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as extraction
from langdetect import detect
import time
from google.cloud import translate_v2 as translate
import nltk
import nltk.stem as stem
from nltk import word_tokenize as tokenize
import re

Preprocessing:
- removing all data whos feature vector is irregular (longer than the column of 44 or something)
- removed all commas in CSV so that it doesn't interfere with the read_csv
- drop features with missing values
- deleted non-movies
- threw out data with multiple parentheses appearences
- translate all titles to english
- removed year and other parentheses from title
- choosing to use snowball(Porter2) stemmer

TO DO:
- Parse our own title

Visualization:
- distribution of languages for movies (maybe keep track of performance for certain language of movie titles)


# Loading  Data

In [238]:
def load_data():
    client = translate.Client()
    data = pd.read_csv("./imdbmovies/imdb.csv")
    
    idxRemove = data[np.logical_not(np.isnan(data['Unnamed: 44']))].index
    dataCleaned = data.drop(idxRemove)
    idxNonMovie = dataCleaned[dataCleaned["type"] != 'video.movie'].index
    dataCleaned.drop(idxNonMovie, inplace=True)
    idxTwoParen = dataCleaned[dataCleaned["title"].str.contains("\(.*\(")].index
    dataCleaned.drop(idxTwoParen, inplace=True)

    dataCleaned.drop(columns=["Unnamed: 44", "Unnamed: 45", "Unnamed: 46", "Unnamed: 47"], inplace=True)
    dataCleaned.dropna(axis=0, inplace=True)
    dataCleaned.reset_index(inplace=True)
    dataCleaned.drop(columns=["index", "url", "tid", "fn", "wordsInTitle", "type"], inplace=True)
    dataCleaned.title = dataCleaned.title.apply(lambda title: re.sub(r' \(([ a-zA-Z]*)([0-9]{4})\)', r'\1', title))\
                        .apply(lambda title: client.translate(title)["translatedText"])\
                        
    
    X = dataCleaned.iloc[:, :11]
    y = dataCleaned.iloc[:, 11:]
    
    return X, y

In [239]:
start = time.time()
origX, y = load_data()
print("Time to load data: {} seconds".format(time.time() - start))

Time to load data: 0.13455867767333984 seconds


In [227]:
print("Labels ", y.columns.to_list())
print("Features ", X.columns.to_list())

Labels  ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'FilmNoir', 'GameShow', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'RealityTV', 'Romance', 'SciFi', 'Short', 'Sport', 'TalkShow', 'Thriller', 'War', 'Western']
Features  ['title', 'imdbRating', 'ratingCount', 'duration', 'year', 'nrOfWins', 'nrOfNominations', 'nrOfPhotos', 'nrOfNewsArticles', 'nrOfUserReviews', 'nrOfGenre']


In [228]:
X.to_csv("./imdbmovies/features.csv", index=False)
y.to_csv("./imdbmovies/labels.csv", index=False)

In [243]:
origX.title = origX.title.apply(lambda title: re.sub(r' \(([ a-zA-Z]*)([0-9]{4})\)', r'\1', title))

In [247]:
origX.title[origX.title.str.contains("[^0-9a-zA-Z]+")]

0                              Der Vagabund und das Kind
3                                            Der General
4                                  Lichter der Großstadt
6                              Es geschah in einer Nacht
7                                         Moderne Zeiten
                              ...                       
10091                             Herrin der toten Stadt
10093                                      You Can't Win
10094                                Startbahn ins Glück
10095        Die Abenteuer von Ichabod und Taddäus Kröte
10096    Ein Yankee aus Connecticut an König Arthurs Hof
Name: title, Length: 8751, dtype: object

# Text Vectorization

In [229]:
tfidfVector = extraction.TfidfVectorizer()
hashVector = extraction.HashingVectorizer()

stemmer = stem.snowball.EnglishStemmer()

In [230]:
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
titles = X.title.tolist()

In [236]:
[x for x in titles if "&#" in x]

['Twelve o&#39;clock in the afternoon',
 'One flew over the cuckoo&#39;s nest',
 'Picnic on Valentine&#39;s Day',
 'Something&#39;s going on in Hollywood',
 'The white band - A German children&#39;s story',
 'Breakfast at Tiffany&#39;s',
 'Colonel of Ryan&#39;s Express',
 'Akira Kurosawa&#39;s dreams',
 'Fellini&#39;s Satyricon',
 'Adam&#39;s apples',
 'Year One - It&#39;s hard to start',
 'Jo Nesbø&#39;s Headhunters',
 'Ivan&#39;s childhood',
 'May &#39;43 - The destruction of the dams',
 'Bill &amp; Ted&#39;s crazy journey through time',
 'Kiki&#39;s small delivery service',
 'Ice Age 2 - It&#39;s time to go',
 'Come and see&#39;',
 'Hunt for &#39;Red October&#39;',
 'I&#39;m fighting for you',
 'A midsummer night&#39;s dream',
 'Goya&#39;s spirits',
 'A Bug&#39;s Life',
 'God&#39;s work &amp; devil&#39;s contribution',
 'Harry Potter and the Philosopher&#39;s Stone',
 'Law of the Road - Brooklyn&#39;s Finest',
 'Welcome to the Sch&#39;tis',
 'That&#39;s not yesterday',
 'Portrait of

In [60]:
transOrig = tfidfVector.fit_transform(titles)

In [66]:
print(transOrig)

  (0, 5223)	0.5698597033536394
  (0, 2133)	0.302226473722024
  (0, 10058)	0.32434359790517586
  (0, 10178)	0.6565152043451231
  (0, 2218)	0.2184218259168701
  (1, 3886)	1.0
  (2, 6263)	1.0
  (3, 3662)	0.9488637667278759
  (3, 2218)	0.31568584414095496
  (4, 4002)	0.6592533797461096
  (4, 5692)	0.7134704605042111
  (4, 2218)	0.2373707717501053
  (6, 6643)	0.4101051014664745
  (6, 2651)	0.44013267595613675
  (6, 4646)	0.29074965301974914
  (6, 3703)	0.5930031102415386
  (6, 2874)	0.44934283550494036
  (7, 11000)	0.6658551974364304
  (7, 6393)	0.7460809983151241
  (8, 10342)	0.6429125746398725
  (8, 10729)	0.6155503296213121
  (8, 10440)	0.4558083073760552
  (9, 10546)	0.5412586174363675
  (9, 6635)	0.36154944574335857
  (9, 3616)	0.4349375803624488
  :	:
  (10091, 9079)	0.5133636343368432
  (10091, 9791)	0.5439751032394323
  (10091, 2218)	0.21564642908768678
  (10092, 8237)	1.0
  (10093, 10726)	0.6523288532876372
  (10093, 1482)	0.5821841842644808
  (10093, 10935)	0.4853129328183386
  (1

In [78]:
transEng = tfidfVectorEng.fit_transform(titlesEng)

In [80]:
print(tfidfVectorEng.transform(['The']))

  (0, 7665)	1.0


In [74]:
tfidfVectorEng.get_stop_words()

In [65]:
tfidfVectorEng.get_feature_names()

['00',
 '000',
 '007',
 '01',
 '10',
 '100',
 '1000',
 '1001',
 '101',
 '102',
 '105',
 '109',
 '10th',
 '11',
 '1138',
 '117',
 '12',
 '120',
 '127',
 '13',
 '13th',
 '14',
 '1408',
 '1453',
 '1492',
 '15',
 '16',
 '17',
 '174',
 '1776',
 '18',
 '180',
 '1812',
 '18th',
 '19',
 '1901',
 '1911',
 '1931',
 '1935',
 '1937',
 '1938',
 '1939',
 '1941',
 '1942',
 '1944',
 '1950',
 '1953',
 '1962',
 '1974',
 '1980',
 '1983',
 '1987',
 '1989',
 '1991',
 '1998',
 '1999',
 '19th',
 '1st',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2004',
 '2006',
 '2009',
 '2011',
 '2012',
 '2019',
 '2022',
 '2046',
 '2077',
 '20th',
 '21',
 '211',
 '21st',
 '22',
 '23',
 '237',
 '23rd',
 '24',
 '25',
 '25th',
 '27',
 '27th',
 '28',
 '2nd',
 '30',
 '300',
 '3000',
 '33',
 '35',
 '36',
 '39',
 '3d',
 '3rd',
 '40',
 '420',
 '42nd',
 '43',
 '45',
 '451',
 '46',
 '47',
 '48',
 '49th',
 '4th',
 '50',
 '51',
 '51st',
 '52',
 '54',
 '5555',
 '56th',
 '57',
 '571',
 '5ecret',
 '5tar',
 '5th',
 '5tory',
 '5ystem',
 '60

In [119]:
lem = stem.WordNetLemmatizer()
stemmer = stem.snowball.EnglishStemmer()

In [130]:
testStr = 'A Beautiful Minds - genius and madness 2019th'
" ".join([stemmer.stem(x) for x in tokenize(testStr) if bool(re.match('^[0-9a-zA-Z]+$', x))])

'a beauti mind genius and mad 2019th'

In [113]:
porter.stem("articles")

'articl'

In [None]:
tfidfVector.get_feature_names()

In [None]:
preprocess = tfidfVector.build_preprocessor()

In [None]:
preprocess('Savage Harvest 2: October Blood')

In [None]:
display(tfidfVector.vocabulary_)

In [None]:
display(tfidfVector.idf_)

In [None]:
display(vectorizer.transform([text[0]]).toarray())