<a href="https://colab.research.google.com/github/atotev/ca683-2021-dara/blob/main/Create_NetFlix_Mstr_DS_Initial_trials.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime and switch the working directory to the
# project folder, so the relative file names used by the following cells
# (e.g. 'data1.csv', 'combined_data_1.txt') resolve against that folder.
from google.colab import drive
import os
drive.mount('/content/drive/')
os.chdir('/content/drive/My Drive/Netflix_MR/')

Mounted at /content/drive/


In [2]:
!ls

all_sparse_matrix.npz  combined_data_2.txt  combined_data_4.txt  data.csv
combined_data_1.txt    combined_data_3.txt  data1.csv


In [3]:
from datetime import datetime

In [4]:
import pandas as pd
import numpy as np

In [5]:
start = datetime.now()
if not os.path.isfile('data1.csv'):
    # Merge the four raw Netflix rating files into one flat CSV ('data1.csv').
    # The raw files list a "<movie_id>:" header line followed by that movie's
    # "user,rating,date" rows; every output row is "movie,user,rating,date".
    files = ['combined_data_1.txt', 'combined_data_2.txt',
             'combined_data_3.txt', 'combined_data_4.txt']

    # Write to a scratch file and rename it only after everything succeeded.
    # Otherwise an interrupted run would leave a truncated 'data1.csv' behind
    # and the isfile() guard above would silently reuse it on the next run.
    partial_path = 'data1.csv.partial'
    movie_id = None
    with open(partial_path, mode='w') as data:
        for file in files:
            print("Reading ratings from {}...".format(file))
            with open(file) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # A blank line carries no rating; skip it instead of
                        # emitting a malformed "movie," row.
                        continue
                    if line.endswith(':'):
                        # These are ratings for this movie, until another movie appears in the file.
                        movie_id = line.replace(':', '')
                    elif movie_id is None:
                        raise ValueError(
                            "{}: rating row found before any 'movie_id:' header: {!r}".format(file, line))
                    else:
                        # Prefix the current movie id to the "user,rating,date" row.
                        data.write(movie_id + ',' + line + '\n')
    # Atomic on the same filesystem: 'data1.csv' appears only when complete.
    os.replace(partial_path, 'data1.csv')
print('Time taken :', datetime.now() - start)

Time taken : 0:00:00.000740


In [6]:
# Load the merged ratings file; it has no header row, so column names are supplied here.
print("creating the pandas dataframe from data1.csv file..")
df = pd.read_csv(
    'data1.csv',
    sep=',',
    names=['movie', 'user', 'rating', 'date'],
)
# Convert the textual date column into proper datetime values.
df['date'] = pd.to_datetime(df['date'])
print('Done.\n')

# Order the rows by user id (rebinding instead of mutating in place).
print('Sorting the dataframe by user..')
df = df.sort_values(by='user')
print('Done..')

df.head()

creating the pandas dataframe from data1.csv file..
Done.

Sorting the dataframe by user..
Done..


Unnamed: 0,movie,user,rating,date
79334307,14358,6,2,2005-12-04
34092597,6134,6,4,2005-01-12
32652672,5926,6,4,2005-10-26
38173947,6797,6,3,2004-11-10
20618167,3905,6,3,2005-12-04


In [7]:
#df.describe()

In [8]:
# Headline counts for the full ratings table: rows, distinct users, distinct movies.
print("Total data ")
print("-" * 50)
print("\nTotal no of ratings :", len(df))
print("Total No of Users   :", np.unique(df['user']).size)
print("Total No of movies  :", np.unique(df['movie']).size)

Total data 
--------------------------------------------------

Total no of ratings : 100480507
Total No of Users   : 480189
Total No of movies  : 17770


In [9]:
# Number of ratings given by each user, most active users first.
no_of_rated_movies_per_user = (
    df.groupby(by='user')['rating']
      .count()
      .sort_values(ascending=False)
)

no_of_rated_movies_per_user.head()

user
305344     17653
387418     17436
2439493    16565
1664010    15813
2118461    14831
Name: rating, dtype: int64

In [10]:
# Summary statistics (count, mean, std, quartiles, max) of the per-user rating counts.
no_of_rated_movies_per_user.describe()

count    480189.000000
mean        209.251997
std         302.339155
min           1.000000
25%          39.000000
50%          96.000000
75%         259.000000
max       17653.000000
Name: rating, dtype: float64

In [11]:
# Number of ratings received by each movie, most-rated first, then its summary statistics.
no_of_ratings_per_movie = (
    df.groupby(by='movie')['rating']
      .count()
      .sort_values(ascending=False)
)
no_of_ratings_per_movie.describe()

count     17770.000000
mean       5654.502364
std       16909.673269
min           3.000000
25%         192.000000
50%         561.000000
75%        2667.750000
max      232944.000000
Name: rating, dtype: float64

In [12]:
from scipy import sparse
from scipy.sparse import csr_matrix

In [13]:
start = datetime.now()
if not os.path.isfile('all_sparse_matrix.npz'):
    print("We are creating sparse_matrix from the dataframe..")
    # Build the (user, movie) ratings matrix in CSR form: each rating becomes
    # the value stored at row = user id, column = movie id.
    rating_values = df.rating.values
    user_movie_index = (df.user.values, df.movie.values)
    all_sparse_matrix = sparse.csr_matrix((rating_values, user_movie_index))

    print('Done. It\'s shape is : (user, movie) : ', all_sparse_matrix.shape)
    print('Saving it into disk..')
    # Persist the matrix so later runs can skip the construction step.
    sparse.save_npz("all_sparse_matrix.npz", all_sparse_matrix)
    print('Done..\n')
else:
    print("It is present in your pwd, will get it from disk....")
    # A saved copy exists: load it instead of recomputing.
    all_sparse_matrix = sparse.load_npz('all_sparse_matrix.npz')
    print("Done..")

print(datetime.now() - start)

It is present in your pwd, will get it from disk....
Done..
0:00:07.790220


In [14]:
# Sparsity = percentage of (user, movie) cells that hold no rating.
us, mv = all_sparse_matrix.shape
elem = all_sparse_matrix.count_nonzero()

filled_fraction = elem / (us * mv)
print("Sparsity Of All matrix is : {} % ".format((1 - filled_fraction) * 100))

Sparsity Of All matrix is : 99.78658865580644 % 


In [15]:
# Container for the average ratings computed in the cells below.
all_averages = {}

In [16]:
def get_average_ratings(sparse_matrix, of_users):
    """Return the average non-zero rating per user or per movie.

    Parameters
    ----------
    sparse_matrix : scipy.sparse matrix or array
        Ratings laid out as (user, movie). A zero entry means "not rated"
        and is excluded from both the sum and the count.
    of_users : bool
        Truthy: average along each row (one value per user).
        Falsy: average along each column (one value per movie).

    Returns
    -------
    dict
        Maps the row (or column) index to its average rating. Indices that
        have no non-zero rating are omitted, so no division by zero occurs.
    """
    ax = 1 if of_users else 0  # 1 - user axis, 0 - movie axis

    # np.asarray(...).ravel() flattens the np.matrix returned by sparse
    # *matrices* as well as the plain ndarray returned by sparse *arrays*,
    # which has no ``.A1`` attribute.
    sum_of_ratings = np.asarray(sparse_matrix.sum(axis=ax)).ravel()
    # Boolean matrix of ratings (whether a user rated that movie or not).
    is_rated = sparse_matrix != 0
    no_of_ratings = np.asarray(is_rated.sum(axis=ax)).ravel()

    # Only indices with at least one rating get an entry; the division is
    # done once on whole arrays instead of once per index in Python.
    rated = np.flatnonzero(no_of_ratings)
    averages = sum_of_ratings[rated] / no_of_ratings[rated]
    return dict(zip(rated.tolist(), averages))

In [17]:
# Mean over every stored (non-zero) rating in the whole matrix.
total_of_ratings = all_sparse_matrix.sum()
count_of_ratings = all_sparse_matrix.count_nonzero()
all_global_average = total_of_ratings / count_of_ratings
all_averages['global'] = all_global_average
all_averages

{'global': 3.604289964420661}

In [18]:
# Per-user average ratings, stored under the 'user' key; show one sample user.
user_average_ratings = get_average_ratings(all_sparse_matrix, of_users=True)
all_averages['user'] = user_average_ratings
print('\nAverage rating of user 25 :', user_average_ratings[25])


Average rating of user 25 : 3.4814814814814814


In [19]:
# Per-movie average ratings, stored under the 'movie' key; show one sample movie.
movie_average_ratings = get_average_ratings(all_sparse_matrix, of_users=False)
all_averages['movie'] = movie_average_ratings
print('\n AVerage rating of movie 35 :', movie_average_ratings[35])


 AVerage rating of movie 35 : 3.1454112038140645


In [20]:
# Keep only the rows whose rating is strictly greater than 2.
df1 = df.loc[df['rating'].gt(2)]

In [21]:
# Same headline counts as for the full table, now for the filtered frame df1.
print("\nTotal no of ratings :", len(df1))
print("Total No of Users   :", np.unique(df1['user']).size)
print("Total No of movies  :", np.unique(df1['movie']).size)


Total no of ratings : 85730437
Total No of Users   : 479760
Total No of movies  : 17770
