In [31]:
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy
from nltk.stem.snowball import SnowballStemmer
from sklearn.pipeline import Pipeline
%matplotlib inline
sns.set()

In [47]:
class DataLoader(object):
    """Load the raw IMDb and wiki JSON dumps into pandas DataFrames.

    Attributes
    ----------
    movies_n_series : pandas.DataFrame
        One row per title JSON file, with the 'Ratings' key removed.
    ratings : pandas.DataFrame
        One row per entry of every title's 'Ratings' list, tagged with the
        title's 'imdbID'.
    wiki : pandas.DataFrame
        One row per wiki JSON file.
    """

    def __init__(self, imdb_path='raw_data', wiki_path=None):
        """Load every data set.

        Parameters
        ----------
        imdb_path : str
            Directory holding one JSON file per title.
        wiki_path : str or None
            Directory holding the wiki JSON files; defaults to
            ``<imdb_path>/wiki_data``.
        """
        if wiki_path is None:
            wiki_path = os.path.join(imdb_path, 'wiki_data')
        self.movies_n_series, self.ratings = self.load_imdb(imdb_path)
        self.wiki = self.load_wiki(wiki_path)

    @staticmethod
    def open_json(full_file_path):
        """Parse one JSON file and return the decoded object."""
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(full_file_path, 'r', encoding='utf-8') as jfile:
            return json.load(jfile)

    def load_from_path(self, path_for_data_files):
        """Return the parsed content of every valid JSON file in a directory.

        A file is valid when its name ends with '.json' and contains neither
        '!' nor '?'. Files are read in sorted name order so the resulting row
        order is reproducible (os.listdir order is arbitrary).
        """
        loaded = []
        for file_name in sorted(os.listdir(path_for_data_files)):
            # Test the file name only: the directory part must not influence
            # the filter, and 'x.json.bak' is not a JSON file.
            if not file_name.endswith('.json'):
                continue
            if '!' in file_name or '?' in file_name:
                continue
            full_path = os.path.join(path_for_data_files, file_name)
            if os.path.isfile(full_path):
                loaded.append(self.open_json(full_path))
        return loaded

    def load_imdb(self, path_for_movie_jsons='raw_data'):
        """Build the titles frame and the ratings frame.

        Returns
        -------
        (pandas.DataFrame, pandas.DataFrame)
            Titles (without 'Ratings') and the flattened ratings.
        """
        movies_n_series = []
        ratings = []
        for movie_details in self.load_from_path(path_for_movie_jsons):
            # A missing or null 'Ratings' entry simply contributes no rows.
            title_ratings = movie_details.pop('Ratings', None) or []
            for rating in title_ratings:
                rating['imdbID'] = movie_details['imdbID']
            ratings.extend(title_ratings)
            movies_n_series.append(movie_details)

        return pd.DataFrame(movies_n_series), pd.DataFrame(ratings)

    def load_wiki(self, path_for_wiki_jsons=None):
        """Build the wiki frame; the path defaults to raw_data/wiki_data."""
        if path_for_wiki_jsons is None:
            path_for_wiki_jsons = os.path.join('raw_data', 'wiki_data')
        return pd.DataFrame(self.load_from_path(path_for_wiki_jsons))
# Build the loader once: every DataLoader() call re-reads all JSON files from
# disk, so three separate instances would parse the same data three times.
data_loader = DataLoader()
dfs = {
    'movies_n_series_df': data_loader.movies_n_series,
    'rating_df': data_loader.ratings,
    'wiki_df': data_loader.wiki,
}

In [48]:
# Display the dictionary of loaded frames (the repr includes every frame, so the output is long).
dfs

{'movies_n_series_df':                                                  Actors  \
 0     Luke Wilson, Maya Rudolph, Dax Shepard, Terry ...   
 1     Thomas Jane, Marcia Gay Harden, Laurie Holden,...   
 2     AnnaSophia Robb, Helen Hunt, Dennis Quaid, Car...   
 3     Woody Allen, Carolyn Saxon, Tracey Ullman, Mic...   
 4     Art LaFleur, Tom Guiry, Mike Vitar, Patrick Renna   
 5     Jeffrey Dean Morgan, Kyra Sedgwick, Natasha Ca...   
 6     Hugh Grant, Sarah Jessica Parker, Natalia Klim...   
 7     Channing Tatum, Jamie Foxx, Maggie Gyllenhaal,...   
 8     Olga Kurylenko, Tom Brooke, Paddy Considine, J...   
 9     Nicolas Cage, Sam Rockwell, Alison Lohman, Bru...   
 10    Gerard Butler, Emmy Rossum, Patrick Wilson, Mi...   
 11    Pål Sverre Hagen, Anders Baasmo Christiansen, ...   
 12    Gerard Butler, Christopher Plummer, Jonny Lee ...   
 13    Jesse Eisenberg, Kelsey Ledgin, Michael Zegen,...   
 14    Jamie Lee Curtis, Lindsay Lohan, Mark Harmon, ...   
 15    Amy Schumer

# Movies table

In [None]:
# Connect to the SQLite file imdb_test.db in the current working directory
# and read the whole `movies` table into a DataFrame.
engine = sqlalchemy.create_engine('sqlite:///{}'.format(os.path.join(os.getcwd(), 'imdb_test.db')))
movies = pd.read_sql("""select * from movies""", con=engine)

In [None]:
# (rows, columns) of the movies table.
movies.shape

In [None]:
# Preview the first rows of the movies table.
movies.head()

In [None]:
# List the available column names.
movies.columns

# Extract year from 'Released' and 'Year'

In [None]:
# Collect every non-digit run found in 'Year' to see which separators the column uses.
# expand=False makes str.extract return a Series; with a DataFrame result, set()
# would iterate over the column labels instead of the extracted values.
set(movies['Year'].str.extract(r'(\D+)', expand=False).dropna())

In [None]:

def mean_years_from_range(years):
    """Return the mean of the years in a 'YYYY' or 'YYYY–YYYY' value as a float.

    The value is split on the en dash. Empty parts are ignored, so an
    open-ended range such as '2010–' yields 2010.0. Going through str()
    lets the function accept values that are already numeric, which keeps
    the cell safe to re-run. Returns NaN when no part is left.
    """
    parts = [part for part in str(years).split('–') if part.strip()]
    if not parts:
        return np.nan
    return np.mean([float(part) for part in parts])


movies['Year'] = movies['Year'].apply(mean_years_from_range)

In [None]:
# Take the digits that follow "<word character><space>" in 'Released' as the release year.
# expand=False returns a Series, so the column assignment does not depend on
# pandas accepting a one-column DataFrame.
movies['Released_year'] = movies['Released'].str.extract(r'\w (\d+)', expand=False).astype(float)
# Show a few rows where the release year disagrees with 'Year'.
movies.query('Released_year != Year')[['Released', 'Released_year', 'Year']].head()

In [None]:
# Count how often each absolute gap between 'Released_year' and 'Year' occurs.
np.abs(movies['Released_year'] - movies['Year']).value_counts()

The differences between the two columns are small, so I'll stick with the 'Year' column.

In [None]:
# Drop the helper column. errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the column is already gone.
movies = movies.drop(columns='Released_year', errors='ignore')

# 'Runtime' column

In [None]:
# Collect the distinct non-digit parts of 'Runtime' (i.e. the unit text).
# expand=False yields a Series of strings; the default DataFrame's .values is a
# 2-D array whose rows are unhashable, so set() would raise TypeError.
set(movies.Runtime.str.extract(r'(\D+)', expand=False).dropna())

# 'Rated' column

In [None]:
# Frequency of each value in 'Rated'.
movies.Rated.value_counts()

# 'Type' column

In [None]:
# Bar chart of how many rows fall into each 'Type'.
# Pass the vector by keyword: recent seaborn versions treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x=movies['Type']);

# 'Metascore' column ?

In [None]:
# Peek at the first ten 'Metascore' values.
movies.Metascore.head(10)

# 'Awards' column

In [None]:
# Join all award descriptions into one string and strip digits, dots and commas,
# leaving only the words so the vocabulary can be inspected.
# dropna()/astype(str) keep str.join from failing on missing or non-string entries.
total_awards = ' '.join(movies.Awards.dropna().astype(str))
total_awards = re.sub(r'[\.,\d]', '', total_awards)
total_awards

In [None]:
from collections import Counter
# Count how often each whitespace-separated word occurs in the stripped awards text.
Counter(total_awards.split())

In [None]:
# Break every award description on '&' and then on '.', keeping each fragment,
# and show the distinct fragments.
splited_awards = [
    fragment
    for award_text in movies.Awards.values
    for ampersand_part in award_text.split('&')
    for fragment in ampersand_part.split('.')
]
set(splited_awards)

In [None]:
def extract_win_and_nominate(awards_string):
    """Parse an awards description into (wins, nominations).

    The text is split on '&', ',' and '.'. In each fragment the first run of
    digits is the count: it is added to the nominations when the fragment
    contains 'nominat', otherwise to the wins when it contains 'win' or 'won'
    (case-insensitive).

    Parameters
    ----------
    awards_string : object
        Awards text; converted with str(), so missing values are accepted.

    Returns
    -------
    tuple of (int, int)
        Total wins and total nominations; (0, 0) when nothing matches.
    """
    win_number, nominate_number = 0, 0
    for fragment in re.split(r'&|,|\.', str(awards_string)):
        count_match = re.search(r'\d+', fragment)
        if count_match is None:
            # A keyword without a number (e.g. "Winner ...") carries no count;
            # skipping it avoids calling .group() on None.
            continue
        count = int(count_match.group(0))
        lowered = fragment.lower()
        if 'nominat' in lowered:
            nominate_number += count
        elif 'win' in lowered or 'won' in lowered:
            win_number += count

    return win_number, nominate_number

In [None]:
# Run the parser over every 'Awards' value and unzip the (wins, nominations) pairs into two tuples.
win_number, nominate_number = zip(*movies['Awards'].apply(extract_win_and_nominate))
# Show the parsed counts next to the raw text for a manual check.
list(zip(win_number, nominate_number, movies['Awards']))

# Extract from comma-separated strings

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Tokenize on commas (with or without one surrounding space) instead of on words,
# so each comma-separated entry becomes a single token.
vec = CountVectorizer(tokenizer=lambda t: re.split(' , |, |,| ,', t))

In [None]:
# 'Language' values for the rows labelled up to and including 2.
movies.loc[:2, 'Language']

In [None]:
# Token-count matrix for the sample rows, one column per distinct language token.
# get_feature_names() was removed from scikit-learn; get_feature_names_out() replaces it.
pd.DataFrame(vec.fit_transform(movies.loc[:2, 'Language']).toarray(), columns=vec.get_feature_names_out())

# 'BoxOffice'

In [None]:
# Strip every non-digit from 'BoxOffice' and check that each distinct result converts to float.
# regex=True is required: pandas 2 treats the pattern as a literal string by default,
# which would leave the currency symbols and commas in place.
for i in set(movies['BoxOffice'].str.replace(r'\D', '', regex=True).replace('', np.nan)):
    print(i)
    print(float(i))
    print('-------------------')

# clean

In [None]:
def clean_movies(movies_df):
    """Return a cleaned copy of the raw movies table; the input is not modified.

    Steps
    -----
    * drop the columns 'id', 'DVD', 'Website', 'Response', 'Poster', 'Released'
    * turn the 'N/A' placeholder into NaN
    * 'imdbVotes': remove thousands separators and convert to float
    * 'BoxOffice': keep digits only and convert to float; values written
      without a '$' sign are multiplied by 1.3
    * 'Year': mean of the years in a 'YYYY–YYYY' range, as float
    * 'Runtime': first run of digits, as float

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Raw movies table containing the columns named above.

    Returns
    -------
    pandas.DataFrame
    """
    def mean_years_from_range(years):
        # Empty parts are skipped so an open range such as '2010–' does not crash.
        parts = [part for part in str(years).split('–') if part.strip()]
        if not parts:
            return np.nan
        return np.mean([float(part) for part in parts])

    movies_df = movies_df.copy()
    # drop irrelevant columns
    movies_df = movies_df.drop(['id', 'DVD', 'Website', 'Response', 'Poster', 'Released'], axis=1)
    # convert the placeholder to NaN
    movies_df = movies_df.replace('N/A', np.nan)

    movies_df['imdbVotes'] = movies_df['imdbVotes'].str.replace(',', '', regex=False).astype(float)

    box_office = movies_df['BoxOffice']
    present = box_office.dropna()
    # Build the mask on the same Series it filters so both share one index.
    non_dollar_index = present[~present.str.contains(r'\$')].index
    digits_only = box_office.str.replace(r'\D', '', regex=True)
    movies_df['BoxOffice'] = pd.to_numeric(digits_only, errors='coerce').astype(float)
    # NOTE(review): 1.3 looks like a fixed pound-to-dollar rate — confirm.
    movies_df.loc[non_dollar_index, 'BoxOffice'] *= 1.3

    # Work on the local copy, never on the notebook-level `movies` frame.
    movies_df['Year'] = movies_df['Year'].apply(mean_years_from_range)
    movies_df['Runtime'] = movies_df['Runtime'].str.extract(r'(\d+)', expand=False).astype(float)
    return movies_df

In [None]:
# Index labels of the rows whose 'BoxOffice' text has no '$' sign.
# na=True counts missing values as "contains", which excludes them from the result
# and keeps the mask boolean so that `~` works.
movies[~movies['BoxOffice'].str.contains(r'\$', na=True)].index

In [None]:
# Run the cleaning function on the movies table.
clean = clean_movies(movies)

In [None]:
# Column dtypes and non-null counts after cleaning.
clean.info()

In [None]:
# Preview the cleaned table.
clean.head()

In [None]:
def extract_from_comma_sperated_strings(full_df, column_name):
    """Turn a comma-separated text column into one count column per distinct entry.

    Missing values are replaced by the placeholder 'Not_provided' before
    vectorizing. Output columns are named '<column_name>_<token>'.

    Parameters
    ----------
    full_df : pandas.DataFrame
        Frame containing `column_name`.
    column_name : str
        Name of the comma-separated text column.

    Returns
    -------
    pandas.DataFrame
        Token counts with the same index as `full_df`, so the result can be
        joined back onto it row by row.
    """
    vec = CountVectorizer(tokenizer=lambda t: re.split(' , |, |,| ,', t))

    df_array = vec.fit_transform(full_df[column_name].fillna('Not_provided')).toarray()
    # get_feature_names() was removed from scikit-learn; fall back to it only
    # on versions that predate get_feature_names_out().
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()
    fields = ['{}_{}'.format(column_name, col) for col in feature_names]

    # Reuse the input index; a fresh RangeIndex would misalign a later join
    # whenever full_df is not indexed 0..n-1.
    return pd.DataFrame(df_array, columns=fields, index=full_df.index)

In [None]:
def extract(movies):
    """Replace the awards and comma-separated columns of a frame with numeric features.

    * 'Awards' becomes 'Awards_wins' and 'Awards_nominate'.
    * 'Country', 'Director', 'Genre', 'Language', 'Actors', 'Production' and
      'Writer' are each replaced by one count column per distinct entry.

    Parameters
    ----------
    movies : pandas.DataFrame
        Cleaned movies table; it is copied, not modified.

    Returns
    -------
    pandas.DataFrame
    """
    movies_df = movies.copy()

    # Plain lists instead of zip(*...) so an empty frame does not fail on unpacking.
    award_counts = [extract_win_and_nominate(text) for text in movies_df['Awards']]
    movies_df['Awards_wins'] = [wins for wins, _ in award_counts]
    movies_df['Awards_nominate'] = [nominations for _, nominations in award_counts]
    movies_df = movies_df.drop('Awards', axis=1)

    for column_name in ['Country', 'Director', 'Genre', 'Language', 'Actors', 'Production', 'Writer']:
        if column_name == 'Director':
            # Some names carry a parenthesised note. Remove each parenthesised
            # group on its own: a greedy '\(.+\)' would also delete every name
            # between the first '(' and the last ')'.
            movies_df['Director'] = movies_df['Director'].str.replace(r'\s*\([^)]*\)', '', regex=True)
        encoded = extract_from_comma_sperated_strings(movies_df, column_name)
        # Rows come back in input order; align them positionally before joining.
        encoded.index = movies_df.index
        movies_df = movies_df.join(encoded).drop(column_name, axis=1)

    return movies_df

In [None]:
# Build the feature table, then list the columns that are still object dtype.
extracted = extract(clean)
extracted.select_dtypes(include='object').head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('wordnet')
# stopwords.words("english") below raises LookupError unless this corpus is present.
nltk.download('stopwords')

from nltk.stem import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer strips punctuation, drops English stop words and stems."""

    def build_analyzer(self):
        """Return a callable mapping one document to its list of stemmed tokens.

        The document is stripped of every character that is neither a word
        character nor whitespace, passed through the parent analyzer, filtered
        against NLTK's English stop words, and each remaining token is stemmed
        with the English Snowball stemmer.
        """
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        stemmer = SnowballStemmer("english")
        stops = set(stopwords.words("english"))
        # Raw string: "\w" and "\s" are not valid escapes in a normal string literal.
        punctuation = re.compile(r"[^\w\s]")

        def stemmed_analyzer(doc):
            return [stemmer.stem(w) for w in analyzer(punctuation.sub("", doc)) if w not in stops]

        return stemmed_analyzer

In [None]:
# 'Plot' text of the row labelled 0.
extracted.loc[0, 'Plot']

In [None]:
# Stemmed bag-of-words matrix of the titles: one row per title, one column per stem.
vec = StemmedCountVectorizer()
d = vec.fit_transform(extracted['Title']).toarray()
# get_feature_names() was removed from scikit-learn; get_feature_names_out() replaces it.
fields = vec.get_feature_names_out()
df = pd.DataFrame(d, columns=fields)

In [None]:
# (number of titles, vocabulary size).
df.shape

In [None]:
from sklearn import metrics
# Pairwise cosine similarity between the rows of the term-count frame.
cosine_simmilarity = metrics.pairwise.cosine_similarity(df)

In [None]:
# Wrap the similarity matrix in a DataFrame whose columns carry df's index labels.
cosine_simmilarity_df = pd.DataFrame(cosine_simmilarity, columns=df.index)

In [None]:
# Look up the row whose imdbID is 'tt0071562'.
extracted[extracted.imdbID == 'tt0071562']

In [None]:
# Similarities for the row labelled 2773, sorted by value.
# NOTE(review): 2773 is hard-coded; presumably the label returned by the imdbID lookup — confirm.
cosine_simmilarity_df.loc[2773, :].sort_values()

In [None]:
# Title of the row labelled 3472.
extracted.loc[3472, 'Title']

In [None]:
# Title of the row labelled 2773.
extracted.loc[2773, 'Title']

In [None]:
import logging
import gensim
import gzip

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def read_input(input_file):
    """Lazily tokenize a gzip-compressed file, one line at a time.

    Yields, for each line of `input_file`, the token list produced by
    gensim.utils.simple_preprocess. Progress is logged at INFO level
    before the first line and then at every 10000th line.
    """
    logging.info("reading file {0}...this may take a while".format(input_file))

    with gzip.open(input_file, 'rb') as compressed_lines:
        line_number = 0
        for raw_line in compressed_lines:
            if line_number % 10000 == 0:
                logging.info("read {0} reviews".format(line_number))
            line_number += 1
            # pre-process the raw line into a list of words
            yield gensim.utils.simple_preprocess(raw_line)

In [None]:
# `import gensim` alone does not load the downloader submodule, so import it explicitly.
import gensim.downloader

# Fetch the corpus if needed and keep only the path of the downloaded file.
text_path = gensim.downloader.load("semeval-2016-2017-task3-subtaskA-unannotated", return_path=True)

In [None]:
# Materialize the generator: one token list per line of the file.
documents = list(read_input(text_path))

In [None]:
# Stem every token of every document in place, after removing any character
# that is neither a word character nor whitespace.
stemmer = SnowballStemmer("english")
for tokens in documents:
    tokens[:] = [stemmer.stem(re.sub(r'[^\w\s]', '', token)) for token in tokens]

In [None]:
# Every column whose name contains 'Genre'.
Genre_cols = [col for col in extracted.columns if 'Genre' in col]
Genre_cols

In [None]:
# Total of each genre column.
np.sum(extracted[Genre_cols])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

# Rebuild the list instead of calling .remove(): .remove() raises ValueError
# when the cell is run a second time.
Genre_cols = [col for col in Genre_cols if col != 'Genre_documentary']
# Rows flagged as documentaries are left out of both the features and the targets.
documentary_index = extracted[extracted['Genre_documentary'] == 1].index
# The cleaning step turns 'N/A' into NaN; the text analyzer needs strings, so
# missing plots become empty documents.
X = extracted['Plot'].drop(documentary_index).fillna('')
y = extracted[Genre_cols].drop(documentary_index)

In [None]:
# Fixed seed so the split, and therefore the scores, are reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

# One L1-regularised logistic regression per genre on stemmed plot word counts.
# liblinear is named explicitly because the default lbfgs solver rejects penalty='l1'.
model = Pipeline([
    ('vec', StemmedCountVectorizer()),
    ('logreg', OneVsRestClassifier(LogisticRegression(penalty='l1', solver='liblinear')))
])

In [None]:
# Fit the pipeline on the training split, then print its score on the training and test splits.
model.fit(x_train, y_train)
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))