In [6]:
import numpy as np
import pandas as pd
from RedditDataPreparation.RedditApiConnector import RedditApiConnector
from RedditDataPreparation.DataPreprocessing import DataPreprocesser
from Models.FirstCNN import FirstCNN
import logging

logging.getLogger('tensorflow').setLevel(logging.WARNING)


class RedditBasedPredictor:

    def __init__(self, name, subreddit='movies'):
        self.subreddit = subreddit
        self.model = FirstCNN()
        self.connector = RedditApiConnector(self.subreddit)
        self.data = self.get_data(name)
        self.preprocessor = DataPreprocesser(self.data, 'body')

    def get_data(self, name):
        return self.connector.search_comments(name)

    def make_prediction(self):
        clean_data = self.preprocessor.full_prepare_data(self.data, 'body')
        return self.model.predict_sentiment(clean_data)

    # Pomyslec nad wagami takimi zeby wywalalo te z ujemnym scorem
    def prepare_avg(self):
        preds = self.make_prediction()
        weights_reshape = np.reshape(self.data['score'].values, (preds.shape))
        return np.average(preds, weights=weights_reshape)

class TestPredictor:
    def __init__(self):
        self.data = pd.read_csv('Data/titles_with_reviews.csv')
        self.data['preds'] = np.NaN
        self.data['preds_diff'] = np.NaN

    def make_preds(self):
        for i, movie in enumerate(self.data['original_title']):
            try:
                predictor = RedditBasedPredictor(movie)
                prediction = predictor.prepare_avg()
                self.data['preds'].iloc[i] = prediction * 100
                self.data['preds_diff'] = np.absolute(self.data['preds']  - self.data['preds'])
                print(self.data.iloc[i])
                if i % 5 == 0:
                    self.data.to_csv('preds.csv', mode='a')
            except KeyError:
                print('No reddit comments')
                
class DataHandling:
    
    def __init__(self):
        self.data = self.preparing_data()
        
    def preparing_data(self):
        top50 = pd.read_csv('Data/Filmweb_top50.csv', index_col=0)
        top50['imdb'] = (top50['imdb'].str.replace('/10', '').astype(float)) / 10
        top50['rottenTomatoes'] = top50['rottenTomatoes'].str.replace('%', '').astype(float) / 100
        top50['metacritic'] = top50['metacritic'].str.replace('/100', '').astype(float) / 100

        return top50

    def measure_one_movie(self, titleFromPrediction):
        predictor = RedditBasedPredictor(titleFromPrediction)
        resultFromPrediction = predictor.prepare_avg()
        
        errorIMDB = round(abs((resultFromPrediction - self.data.loc[titleFromPrediction, 'imdb']) * 100), 2)
        errorRotten = round(abs((resultFromPrediction - self.data.loc[titleFromPrediction, 'rottenTomatoes']) * 100), 2)
        errorMetacritic = round(abs((resultFromPrediction - self.data.loc[titleFromPrediction, 'metacritic']) * 100), 2)

        data = [titleFromPrediction, self.data.loc[titleFromPrediction, 'year'], resultFromPrediction,
                self.data.loc[titleFromPrediction, 'imdb'], errorIMDB, self.data.loc[titleFromPrediction, 'rottenTomatoes'],
                errorRotten, self.data.loc[titleFromPrediction, 'metacritic'], errorMetacritic]
        print(data)
        return data
    

    def measure_all_movies(self):
    
        full_list = []
        columns = ['title', 'year', 'prediction', 'imdb', 'errorIMDB', 'rottenTomatoes', 'errorRotten', 'metacritic',
                   'errorMetacritic']
        data_df = pd.DataFrame(columns=columns)
        for i, movie in enumerate(self.data.index):
            print(movie)
            data_df.loc[len(data_df.index)] = self.measure_one_movie(movie)
            if i % 5 == 0:
                data_df.to_csv('top_50_preds.csv', mode='a')
        
#         data_df = data_df.set_index('title')
#         result_df = result_df.append(data_df)
        

    
if __name__ == '__main__':
    FinalTest = DataHandling()
    FinalTest.measure_all_movies()
#     test = TestPredictor()
#     test.make_preds()


The Shawshank Redemption
['The Shawshank Redemption', 1994, 0.5775977699545221, 0.93, 35.24, 0.91, 33.24, 0.8, 22.24]
The Intouchables
['The Intouchables', 2011, 0.4615832603238329, 0.85, 38.84, 0.75, 28.84, 0.57, 10.84]
The Green Mile
['The Green Mile', 1999, 0.53103310917134, 0.86, 32.9, 0.78, 24.9, 0.61, 7.9]
The Godfather
['The Godfather', 1972, 0.6263442253248638, 0.9199999999999999, 29.37, 0.97, 34.37, 1.0, 37.37]
12 Angry Men
['12 Angry Men', 1957, 0.6600836086514343, 0.9, 23.99, 1.0, 33.99, 0.96, 29.99]
Forrest Gump
['Forrest Gump', 1994, 0.7160076367328951, 0.8800000000000001, 16.4, 0.71, 0.6, 0.82, 10.4]
The Godfather: Part II
['The Godfather: Part II', 1974, 0.4160768239039133, 0.9, 48.39, 0.96, 54.39, 0.9, 48.39]
The Lord of the Rings: The Return of the King
['The Lord of the Rings: The Return of the King', 2003, 0.5890185130947339, 0.89, 30.1, 0.93, 34.1, 0.94, 35.1]
Schindler's List
["Schindler's List", 1993, 0.5076344238768379, 0.89, 38.24, 0.98, 47.24, 0.94, 43.24]
Pulp

NameError: name 'i' is not defined

In [16]:
FinalTest = DataHandling()
test_data = FinalTest.preparing_data()
        

In [18]:
test_data

In [7]:
op50 = pd.read_csv('Data/Filmweb_top50.csv', index_col=0)

In [12]:
op50.index

Index(['The Shawshank Redemption', 'The Intouchables', 'The Green Mile',
       'The Godfather', '12 Angry Men', 'Forrest Gump',
       'The Godfather: Part II',
       'The Lord of the Rings: The Return of the King', 'Schindler's List',
       'Pulp Fiction', 'Se7en', 'Joker',
       'The Lord of the Rings: The Two Towers', 'Fight Club', 'Goodfellas',
       'The Pianist', 'A Beautiful Mind', 'Inception', 'Django Unchained',
       'The Silence of the Lambs', 'The Lion King', 'Scarface', 'Gran Torino',
       'Shutter Island', 'Coco', 'American History X', 'Green Book',
       'Leon: The Professional', 'Gladiator', 'The Boy in the Striped Pyjamas',
       'Saving Private Ryan', 'Braveheart', 'Whiplash', 'Hacksaw Ridge',
       'Good Will Hunting', 'Apocalypse Now', 'Scent of a Woman',
       'The Prestige', 'Avengers: Infinity War',
       'Star Wars: Episode V - The Empire Strikes Back',
       'Once Upon a Time in America',
       'Star Wars: Episode VI - Return of the Jedi', 'The H

In [9]:
preds['preds_diff'] = preds['preds'] - preds['reviews_from_users']

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [10]:
np.NaN - 1

nan