In [86]:
import json
import ast
import pandas
from collections import defaultdict
import numpy as np
from sklearn import svm

## Put Reviews in Readable Format

### Important Output: female_reviews, male_reviews

In [5]:
def _read_review_file(path, repair_prefix=False):
    """Parse a file that holds one review dict literal per line.

    Every line is handed to ``ast.literal_eval``.  When that fails and
    ``repair_prefix`` is true, the first two characters of the line are
    replaced with ``{'`` and the parse is retried once.

    Returns a ``(reviews, error_count)`` tuple; ``error_count`` is the number
    of lines that could not be parsed by any attempt.
    """
    # Only the exceptions that signal a malformed literal are treated as a
    # bad line; anything else (e.g. KeyboardInterrupt, MemoryError) now
    # propagates instead of being silently counted.
    parse_errors = (ValueError, TypeError, SyntaxError)
    reviews = []
    error_count = 0
    with open(path) as data_file:
        for line in data_file:
            candidates = [line]
            if repair_prefix:
                candidates.append("{'" + line[2:])
            for candidate in candidates:
                try:
                    reviews.append(ast.literal_eval(candidate))
                    break
                except parse_errors:
                    continue
            else:
                # no candidate parsed
                error_count += 1
    return reviews, error_count


def _alias_variant_keys(reviews, canonical_keys=('movie_id', 'icon')):
    """Expose values stored under keys that merely contain a canonical name.

    For every key of every review that contains one of ``canonical_keys`` as a
    substring, the value is also stored under the canonical key itself.  The
    review dicts are modified in place.
    """
    for review in reviews:
        # Iterate over a snapshot of the keys: inserting the canonical key
        # while iterating the live dict raises RuntimeError on Python 3.
        for key in list(review):
            for canonical in canonical_keys:
                if canonical in key:
                    review[canonical] = review[key]


# list of review dictionaries for females (with the prefix repair enabled)
female_reviews, count_errors = _read_review_file('review_buckets_female.txt', repair_prefix=True)
print(str(len(female_reviews))+" female reviews read with "+str(count_errors)+ " error(s).")
_alias_variant_keys(female_reviews)  # fixing key issues

# list of review dictionaries for males (no repair attempt, as before)
male_reviews, count_errors = _read_review_file('review_buckets_male.txt')
print(str(len(male_reviews))+" male reviews read with "+str(count_errors)+ " error(s).")
_alias_variant_keys(male_reviews)  # fixing key issues

64366 female reviews read with 0 error(s).
64366 male reviews read with 0 error(s).


In [6]:
# Display the first parsed male review to show which keys a review dictionary carries.
male_reviews[0]

{'date': 'April 4, 2011',
 'gender': 'm',
 'icon': 'rotten',
 'id': 'dave-white',
 'link': 'http://www.movies.com/movie-reviews/percy-jackson-the-olympians-the-lightning-thief-review/dave-white/m60204',
 'movie_id': '0814255',
 'name': 'Dave White',
 'review': 'It kind of just builds up to a lot of Not Much.',
 'source': 'Movies.com'}

## Load in Movie Data

### Important Output: movie_data, movie_titles

In [7]:
movie_data = pandas.read_csv('movie_data_cleaned.csv')

# Title normalisation: take repr() of the raw title, keep only the text in
# front of the first backslash, lower-case it, then drop the first and the
# last character (the quotes repr() adds when nothing was cut off).
# NOTE(review): when a backslash escape is present the closing quote has
# already been cut away, so the last real character of the title is dropped
# instead -- presumably unintended; confirm against the data.
movie_titles = []
for raw_title in movie_data['movie_title']:
    quoted = repr(raw_title).split("\\", 1)[0].lower()
    movie_titles.append(quoted[1:-1])

print("There are "+str(len(movie_titles))+" movies in imdb dataset")
print(np.shape(movie_data)) # movie x feature matrix

There are 3801 movies in imdb dataset
(3801, 45)


## Extract Movies We Have Reviews For
### Important Output: male_review_movie_list, female_review_movie_list

In [8]:
def _group_title_keyed_reviews(reviews):
    """Group reviews by movie_id, skipping ids that consist only of digits.

    Returns ``(movie_ids, reviews_by_movie)`` where ``movie_ids`` is the set
    of retained ids and ``reviews_by_movie`` is a ``defaultdict(list)``
    mapping each retained id to its reviews in input order.
    """
    reviews_by_movie = defaultdict(list)
    for review in reviews:
        movie_id = review['movie_id']
        # Call isdigit on the value itself so any text type is accepted;
        # the unbound str.isdigit(...) form rejects non-str text objects.
        if not movie_id.isdigit():
            reviews_by_movie[movie_id].append(review)
    return set(reviews_by_movie), reviews_by_movie


# Despite the *_list names these are sets of movie_ids; the dicts map
# movie_id -> list of reviews.
male_review_movie_list, male_movie_to_review = _group_title_keyed_reviews(male_reviews)
female_review_movie_list, female_movie_to_review = _group_title_keyed_reviews(female_reviews)

## Get Movies in Both Reviews and IMDB (Cross Datasets)

### Important Output: review_movie_id_to_imdb_id, overlap_male_imdb / overlap_female_imdb / overlap_intr / overlap_union (all contain the same movies)

In [9]:
# Each IMDB title is paired with the same title with spaces replaced by
# underscores; that second form is what gets looked up in the review id sets.
title_and_rt_id = [(title, title.replace(" ", "_")) for title in movie_titles]

# Male side: keep the pairs whose underscore form has male reviews.
male_matches = [pair for pair in title_and_rt_id if pair[1] in male_review_movie_list]
overlap_male_imdb = [pair[0] for pair in male_matches]  # IMDB-style titles
overlap_male_rt = [pair[1] for pair in male_matches]    # review-style ids
# review-style id -> IMDB-style title (the title string, not a row number).
# Only the male matches feed this mapping.
review_movie_id_to_imdb_id = dict((pair[1], pair[0]) for pair in male_matches)

# Female side: same matching, without touching the id mapping.
female_matches = [pair for pair in title_and_rt_id if pair[1] in female_review_movie_list]
overlap_female_imdb = [pair[0] for pair in female_matches]
overlap_female_rt = [pair[1] for pair in female_matches]

# Titles matched for both genders / for at least one gender.
overlap_intr = set(overlap_male_imdb).intersection(overlap_female_imdb)
overlap_union = set(overlap_male_imdb).union(overlap_female_imdb)

In [21]:
# Print, one per line: distinct male matches, intersection size, union size,
# distinct female matches.
for overlap_size in (
    len(set(overlap_male_rt)),
    len(overlap_intr),
    len(overlap_union),
    len(set(overlap_female_rt)),
):
    print(overlap_size)

818
818
818
818


### Important Output: rt_movies (list of overlapped review movies), movie_to_idx (movie name from review to index in rt_movies)

In [61]:
# All review-side movie ids in the overlap, de-duplicated.  Sorting fixes the
# order: the iteration order of a set of strings is not guaranteed to be the
# same from one interpreter run to the next, which would reshuffle every
# structure indexed by position in rt_movies.
rt_movies = sorted(set(overlap_male_rt))

# Review movie id -> its index in rt_movies.
movie_to_idx = dict((movie_id, idx) for idx, movie_id in enumerate(rt_movies))

## Make List of Reviewers

### Important Output: reviewers, reviewers_lst, gender_lst

In [57]:
# Set copy of rt_movies so the per-review membership test is a hash lookup
# instead of a scan of the whole list.
rt_movie_ids = set(rt_movies)

# Reviewer name -> list of (review, gender) tuples, male reviews first.
reviewers = defaultdict(list)
for gender, movie_to_review in (("m", male_movie_to_review), ("f", female_movie_to_review)):
    for movie_id in movie_to_review:
        for review in movie_to_review[movie_id]:
            if review["movie_id"] in rt_movie_ids:
                review["gender"] = gender  # overwrites the review's own 'gender' field in place
                reviewers[review["name"]].append((review, gender))

reviewers_lst = list(reviewers)  # just names of each reviewer
# Gender of each reviewer, taken from the first stored review.  Because male
# reviews are added first, a name that occurs in both files is labelled "m".
gender_lst = [reviewers[name][0][1] for name in reviewers_lst]

In [58]:
# Report how many distinct reviewer names were collected.
reviewer_total = len(reviewers_lst)
print("There are " + str(reviewer_total) + " reviewers total")

There are 2482 reviewers total


### Important Output: reviewer_to_idx

In [62]:
# Reviewer name -> position of that name in reviewers_lst.
reviewer_to_idx = dict((name, position) for position, name in enumerate(reviewers_lst))
num_reviewers = len(reviewers)

# Both counts are printed so they can be compared by eye.
print(len(reviewers_lst))
print(num_reviewers)

2482
2482


## Make Review matrix (reviewer by movie)

In [69]:
# Reviewer-by-movie matrix: -1 where the review's icon is "rotten", 1 for any
# other icon, and 0 where the reviewer has no review for the movie.
review_matrix = np.zeros((num_reviewers, len(rt_movies)))
for reviewer_name, tagged_reviews in reviewers.items():
    row = reviewer_to_idx[reviewer_name]
    for tagged in tagged_reviews:
        review = tagged[0]  # each entry is a (review, gender) tuple
        col = movie_to_idx[review["movie_id"]]
        # A later review of the same movie by the same reviewer overwrites the earlier one.
        review_matrix[row, col] = -1 if review["icon"] == "rotten" else 1

### Just a sanity check to make sure matrix is filled

In [70]:
# review_matrix is reviewer by movie.  Count, for each reviewer (row), the
# cells holding -1 or 1.  The comparison runs on the whole array at once
# instead of a Python loop over every cell; the result is cast to float so it
# has the same dtype as the np.zeros-based accumulator used previously.
has_review = (review_matrix == -1) | (review_matrix == 1)
count_reviews = has_review.sum(axis=1).astype(float)
print(count_reviews)

[ 14.   1.  52. ...,   1.   1.   2.]


### review_row_idx_to_imdb_row links each movie's index in rt_movies to its row index in the IMDB data

In [71]:
# Underscore form of each IMDB title -> its position in movie_titles, built
# in one pass.  Later duplicates overwrite earlier ones, so the last matching
# row wins, exactly as with a full scan per movie.
imdb_row_by_rt_id = {}
for imdb_row, title in enumerate(movie_titles):
    imdb_row_by_rt_id[title.replace(" ", "_")] = imdb_row

# Entry k holds the IMDB row for rt_movies[k].  The array is float (np.zeros
# default) and an id with no matching title keeps the initial value 0.
review_row_idx_to_imdb_row = np.zeros(len(rt_movies))
for rt_idx, rt_id in enumerate(rt_movies):
    if rt_id in imdb_row_by_rt_id:
        review_row_idx_to_imdb_row[rt_idx] = imdb_row_by_rt_id[rt_id]

In [73]:
# Dimensions of the rt_movies-index -> IMDB-row array.
review_row_idx_to_imdb_row.shape

(818,)

In [74]:
# Dimensions of the reviewer-by-movie matrix.
review_matrix.shape

(2482, 818)

In [75]:
# Dimensions of the IMDB data frame (rows, columns).
movie_data.shape

(3801, 45)

In [76]:
# gender_lst is a flat list, so its shape is a 1-tuple holding its length.
(len(gender_lst),)

(2482,)

In [81]:
# Numeric gender labels: 0 for "m", 1 for every other value.
gender_lst2 = [0 if gender == "m" else 1 for gender in gender_lst]

In [82]:
# Write the reviewer-by-movie matrix as comma-separated text.
np.savetxt(fname="review_matrix.csv", X=review_matrix, delimiter=",")

In [83]:
# Write the rt_movies-index -> IMDB-row array as comma-separated text.
np.savetxt(fname="review_to_imdb.csv", X=review_row_idx_to_imdb_row, delimiter=",")

In [85]:
# Write the numeric gender labels (0 = "m", 1 = otherwise) as comma-separated text.
np.savetxt(fname="review_to_gender.csv", X=gender_lst2, delimiter=",")