In [1]:
import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as extraction
from langdetect import detect
import time

Preprocessing:
- removing all data whos feature vector is irregular (longer than the column of 44 or something)
- removed all commas in CSV so that it doesn't interfere with the read_csv
- drop features with missing values
- deleted non-movies
- threw out data with multiple parentheses appearences
- only english titles
- removed year and other parentheses from title

TO DO:
- Parse our own title


In [2]:
def load_data():
    data = pd.read_csv("./imdbmovies/imdb.csv")
    
    idxRemove = data[np.logical_not(np.isnan(data['Unnamed: 44']))].index
    dataCleaned = data.drop(idxRemove)
    idxNonMovie = dataCleaned[dataCleaned["type"] != 'video.movie'].index
    dataCleaned.drop(idxNonMovie, inplace=True)
    idxTwoParen = dataCleaned[dataCleaned["title"].str.contains("\(.*\(")].index
    dataCleaned.drop(idxTwoParen, inplace=True)

    dataCleaned.drop(columns=["Unnamed: 44", "Unnamed: 45", "Unnamed: 46", "Unnamed: 47"], inplace=True)
    dataCleaned.dropna(axis=0, inplace=True)
    dataCleaned.reset_index(inplace=True)
    dataCleaned.drop(columns=["index", "url", "tid", "fn", "wordsInTitle", "type"], inplace=True)
    dataCleaned["title"] = dataCleaned["title"].str.replace(' \(.*[0-9]{4}\)', '')
    
    x = dataCleaned.iloc[:, :11]
    y = dataCleaned.iloc[:, 11:]
    
    return x, y

In [12]:
x, y = load_data()

In [4]:
y.columns

Index(['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
       'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'FilmNoir',
       'GameShow', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News',
       'RealityTV', 'Romance', 'SciFi', 'Short', 'Sport', 'TalkShow',
       'Thriller', 'War', 'Western'],
      dtype='object')

In [5]:
x.columns

Index(['title', 'imdbRating', 'ratingCount', 'duration', 'year', 'nrOfWins',
       'nrOfNominations', 'nrOfPhotos', 'nrOfNewsArticles', 'nrOfUserReviews',
       'nrOfGenre'],
      dtype='object')

In [6]:
idxEng = []
for idx, k in x.title.iteritems():
    try:
        if detect(k) == "en":
            idxEng.append(idx)
    except:
        print("FAILED ON", k)

In [15]:
engX = x.loc[idxEng, :]

In [16]:
engX

Unnamed: 0,title,imdbRating,ratingCount,duration,year,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,nrOfGenre
20,It's a Wonderful Life,8.7,197764,7800,1946,6,0,65,531,626,3
27,Rashomon,8.4,74219,5280,1950,9,2,19,102,238,2
74,Butch Cassidy und Sundance Kid,8.2,111087,6600,1969,18,12,51,508,281,3
79,Chinatown,8.3,159935,7800,1974,20,22,73,547,412,3
86,Network,8.2,75434,7260,1976,22,19,49,366,295,1
...,...,...,...,...,...,...,...,...,...,...,...
10083,The Time of Your Life,6.4,717,6540,1948,0,0,26,14,32,2
10084,Working with Animals: 'The Scorpion King',6.5,11,360,2002,0,0,0,0,1,2
10089,VeggieTales: Where's God When I'm S-Scared?,7.4,386,1800,1993,0,0,10,0,2,3
10092,Scaramouche,7.3,321,7440,1923,0,0,2,4,14,2


# Text Vectorization Practice

In [86]:
tfidfVector = extraction.TfidfVectorizer()
hashVector = extraction.HashingVectorizer()

In [84]:
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
titles = x.title.tolist()

In [85]:
titles

['Der Vagabund und das Kind',
 'Goldrausch',
 'Metropolis',
 'Der General',
 'Lichter der Großstadt',
 'M',
 'Es geschah in einer Nacht',
 'Moderne Zeiten',
 'Vom Winde verweht',
 'Mr. Smith geht nach Washington',
 'Der Zauberer von Oz',
 'Früchte des Zorns',
 'Der große Diktator',
 'Rebecca',
 'Citizen Kane',
 'Die Spur des Falken',
 'Casablanca',
 'Frau ohne Gewissen',
 'Die besten Jahre unseres Lebens',
 'Tote schlafen fest',
 "It's a Wonderful Life",
 'Berüchtigt',
 'Fahrraddiebe',
 'Cocktail für eine Leiche',
 'Der Schatz der Sierra Madre',
 'Der dritte Mann',
 'Alles über Eva',
 'Rashomon',
 'Boulevard der Dämmerung',
 'Der Fremde im Zug',
 'Zwölf Uhr mittags',
 'Ikiru - Einmal richtig leben',
 'Du sollst mein Glücksstern sein',
 'Ein Herz und eine Krone',
 'Stalag 17',
 'Die Teuflischen',
 'Bei Anruf Mord',
 'Die Faust im Nacken',
 'Das Fenster zum Hof',
 'Die sieben Samurai',
 'Das Lied der Straße',
 'Die Nacht des Jägers',
 'Killing',
 'Die zwölf Geschworenen',
 'Die Brücke am

In [133]:
tempX = x[:100]

In [142]:
tempX[tempX.title[:100].apply(detect) == "en"]

Unnamed: 0,title,imdbRating,ratingCount,duration,year,nrOfWins,nrOfNominations,nrOfPhotos,nrOfNewsArticles,nrOfUserReviews,nrOfGenre
20,It's a Wonderful Life,8.7,197764,7800,1946,6,0,65,531,626,3
27,Rashomon,8.4,74219,5280,1950,9,2,19,102,238,2
74,Butch Cassidy und Sundance Kid,8.2,111087,6600,1969,18,12,51,508,281,3
79,Chinatown,8.3,159935,7800,1974,20,22,73,547,412,3
86,Network,8.2,75434,7260,1976,22,19,49,366,295,1
88,Taxi Driver,8.4,351181,6780,1976,26,9,80,1830,768,2
93,Apocalypse Now,8.5,333178,9180,1979,17,30,76,1356,896,2


In [89]:
test = ['Der große Diktator']

In [106]:
transform = tfidfVector.fit_transform(titles)

In [107]:
print(transform)

  (0, 5223)	0.5698597033536394
  (0, 2133)	0.302226473722024
  (0, 10058)	0.32434359790517586
  (0, 10178)	0.6565152043451231
  (0, 2218)	0.2184218259168701
  (1, 3886)	1.0
  (2, 6263)	1.0
  (3, 3662)	0.9488637667278759
  (3, 2218)	0.31568584414095496
  (4, 4002)	0.6592533797461096
  (4, 5692)	0.7134704605042111
  (4, 2218)	0.2373707717501053
  (6, 6643)	0.4101051014664745
  (6, 2651)	0.44013267595613675
  (6, 4646)	0.29074965301974914
  (6, 3703)	0.5930031102415386
  (6, 2874)	0.44934283550494036
  (7, 11000)	0.6658551974364304
  (7, 6393)	0.7460809983151241
  (8, 10342)	0.6429125746398725
  (8, 10729)	0.6155503296213121
  (8, 10440)	0.4558083073760552
  (9, 10546)	0.5412586174363675
  (9, 6635)	0.36154944574335857
  (9, 3616)	0.4349375803624488
  :	:
  (10091, 9079)	0.5133636343368432
  (10091, 9791)	0.5439751032394323
  (10091, 2218)	0.21564642908768678
  (10092, 8237)	1.0
  (10093, 10726)	0.6523288532876372
  (10093, 1482)	0.5821841842644808
  (10093, 10935)	0.4853129328183386
  (1

In [113]:
tfidfVector.get_feature_names()

['00',
 '000',
 '007',
 '01',
 '10',
 '100',
 '1000',
 '1001',
 '101',
 '102',
 '105',
 '109',
 '10th',
 '11',
 '1138',
 '117',
 '12',
 '120',
 '127',
 '13',
 '13th',
 '14',
 '1408',
 '1453',
 '1492',
 '15',
 '16',
 '17',
 '174',
 '1776',
 '18',
 '180',
 '1812',
 '18th',
 '19',
 '1901',
 '1911',
 '1931',
 '1935',
 '1937',
 '1938',
 '1939',
 '1941',
 '1942',
 '1944',
 '1950',
 '1953',
 '1962',
 '1974',
 '1980',
 '1983',
 '1987',
 '1989',
 '1991',
 '1998',
 '1999',
 '19th',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2004',
 '2006',
 '2009',
 '2011',
 '2012',
 '2019',
 '2022',
 '2046',
 '2077',
 '20th',
 '21',
 '211',
 '22',
 '23',
 '237',
 '23rd',
 '24',
 '25',
 '25th',
 '27',
 '28',
 '2nd',
 '30',
 '300',
 '3000',
 '33',
 '35',
 '36',
 '39',
 '3d',
 '40',
 '42',
 '420',
 '43',
 '451',
 '45er',
 '46',
 '47',
 '48',
 '49th',
 '50',
 '51',
 '51st',
 '52',
 '54',
 '5555',
 '56th',
 '57',
 '571',
 '5ecret',
 '5tar',
 '5th',
 '5tory',
 '5ystem',
 '60',
 '61',
 '66',
 '666',
 '68',
 '6th',
 '

In [94]:
preprocess = tfidfVector.build_preprocessor()

In [96]:
preprocess('Savage Harvest 2: October Blood')

'savage harvest 2: october blood'

In [91]:
display(tfidfVector.vocabulary_)

{'der': 0, 'große': 2, 'diktator': 1}

In [92]:
display(tfidfVector.idf_)

array([1., 1., 1.])

In [29]:
display(vectorizer.transform([text[0]]).toarray())

array([[0.36388646, 0.27674503, 0.27674503, 0.36388646, 0.36388646,
        0.36388646, 0.36388646, 0.42983441]])