In [69]:
import pandas as pd
import pandas_profiling as pp
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Set up plotly's offline notebook mode so the iplot() calls below can render
# figures inline. NOTE(review): connected=True presumably loads plotly.js from
# a CDN (needs network access when the notebook is viewed) — confirm against
# the plotly.offline documentation.
init_notebook_mode(connected=True)



In [81]:
def plot_item(df, max_count=50, bin_size=2):
    """Plot a histogram of the number of ratings each item received.

    Rows are grouped by the second column of ``df`` and the non-null entries
    of the last column are counted per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table. The second column is used as the item key and the last
        column as the rating; the frame must have at least two columns.
    max_count : int, default 50
        Per-item counts above this value are clipped down to it; it is also
        the upper end of the histogram's bin range.
    bin_size : int, default 2
        Width of each histogram bin.

    Returns
    -------
    None
        The figure is displayed with ``iplot``; nothing is returned.
    """
    ratings_per_item = (
        df.groupby(df.columns[1])[df.columns[-1]].count().clip(upper=max_count)
    )
    trace = go.Histogram(
        x=ratings_per_item.values,
        name=df.columns[-1],
        xbins=dict(start=0, end=max_count, size=bin_size),
    )
    layout = go.Layout(
        title='Distribution Of Number of Ratings Per item',
        # Generic "item" wording on the axis, matching the title: this
        # function is applied to movie and book datasets alike.
        xaxis=dict(title='Number of Ratings Per item'),
        yaxis=dict(title='Count'),
        bargap=0.2,
    )
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)

In [82]:
def plot_user(df, max_count=50, bin_size=2):
    """Plot a histogram of the number of ratings each user gave.

    Rows are grouped by the first column of ``df`` and the non-null entries
    of the last column are counted per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table. The first column is used as the user key and the last
        column as the rating.
    max_count : int, default 50
        Per-user counts above this value are clipped down to it; it is also
        the upper end of the histogram's bin range.
    bin_size : int, default 2
        Width of each histogram bin.

    Returns
    -------
    None
        The figure is displayed with ``iplot``; nothing is returned.
    """
    ratings_per_user = (
        df.groupby(df.columns[0])[df.columns[-1]].count().clip(upper=max_count)
    )
    trace = go.Histogram(
        x=ratings_per_user.values,
        name=df.columns[-1],
        xbins=dict(start=0, end=max_count, size=bin_size),
    )
    layout = go.Layout(
        title='Distribution Of Number of Ratings Per user',
        xaxis=dict(title='Number of Ratings Per user'),
        yaxis=dict(title='Count'),
        bargap=0.2,
    )
    fig = go.Figure(data=[trace], layout=layout)
    iplot(fig)

In [83]:
def plot_df(df):
    """Show a bar chart of how often each distinct rating value occurs.

    The last column of ``df`` is treated as the rating. Bars are ordered by
    rating value, highest first, and each bar is labelled with its share of
    the total row count as a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings table whose last column holds the rating values.

    Returns
    -------
    None
        The figure is displayed with ``iplot``; nothing is returned.
    """
    rating_column = df.columns[-1]
    n_rows = df.shape[0]
    counts = df[rating_column].value_counts().sort_index(ascending=False)

    # Share of all rows (not only of the counted ones) taken by each rating.
    share_pct = counts.values / n_rows * 100
    bar_labels = ['{:.1f} %'.format(pct) for pct in share_pct]

    bars = go.Bar(
        x=counts.index,
        y=counts.values,
        text=bar_labels,
        textposition='auto',
        textfont=dict(color='#000000'),
    )
    layout = dict(
        title='Distribution of {} ratings'.format(n_rows),
        xaxis=dict(title='Rating'),
        yaxis=dict(title='Count'),
    )
    iplot(go.Figure(data=[bars], layout=layout))

In [84]:
def data_analysis(path1, path2):
    """Print summary statistics and draw plots for a train/test pair of CSVs.

    For each split, in order (train, then test), this prints the number of
    ratings falling in each of the intervals (0, 1], (1, 2], ..., (4, 5], the
    number of distinct values in the first column (reported as users) and in
    the second column (reported as items), and then calls ``plot_df``,
    ``plot_user`` and ``plot_item`` on the split.

    Parameters
    ----------
    path1 : str
        Path of the training-set CSV, passed to ``pandas.read_csv``.
    path2 : str
        Path of the test-set CSV, passed to ``pandas.read_csv``.

    Returns
    -------
    None
    """
    rating_bins = [0, 1, 2, 3, 4, 5]
    # Read both files up front so an unreadable test file fails before any
    # output for the train split has been produced.
    train_df = pd.read_csv(path1)
    test_df = pd.read_csv(path2)
    _describe_split(train_df, "Train set", rating_bins)
    _describe_split(test_df, "Test set", rating_bins)


def _describe_split(df, label, bins):
    """Print the rating histogram and user/item counts of one split, then plot it."""
    # Intervals are right-closed, so a rating equal to the first bin edge
    # falls outside every bin and is not counted.
    binned_ratings = pd.cut(df[df.columns[-1]], bins)
    print(label)
    print(binned_ratings.value_counts().sort_index())
    print("Number of users:", df[df.columns[0]].nunique())
    print("Number of items:", df[df.columns[1]].nunique())
    plot_df(df)
    plot_user(df)
    plot_item(df)

In [85]:
# Netflix: the CSV paths are relative, so they are looked up from the
# kernel's current working directory.
print("Dataset: Netflix")
data_analysis(
    path1='train.csv',
    path2='test.csv',
)

Dataset: Netflix
Train set
(0, 1]    11023
(1, 2]    26607
(2, 3]    65141
(3, 4]    61920
(4, 5]    28390
Name: user_rating, dtype: int64
Number of users: 5128
Number of items: 528


Test set
(0, 1]       0
(1, 2]      28
(2, 3]     797
(3, 4]    2780
(4, 5]    1511
Name: user_rating, dtype: int64
Number of users: 5116
Number of items: 288


In [86]:
# MovieLens Small.
# NOTE(review): the paths below are absolute and specific to one machine;
# this cell will only run where that directory layout exists.
print("Dataset: MovieLens Small")
data_analysis(
    path1='/home/jinfeng/Downloads/doctorant/Conférences/machine learning/data_and_codes/MovieLenssmall/train.csv',
    path2='/home/jinfeng/Downloads/doctorant/Conférences/machine learning/data_and_codes/MovieLenssmall/test.csv',
)

Dataset: MovieLens Small
Train set
(0, 1]     4163
(1, 2]     9316
(2, 3]    25516
(3, 4]    39773
(4, 5]    21610
Name: user_rating, dtype: int64
Number of users: 610
Number of items: 9533


Test set
(0, 1]      0
(1, 2]      0
(2, 3]     20
(3, 4]    258
(4, 5]    331
Name: user_rating, dtype: int64
Number of users: 609
Number of items: 394


In [87]:
# MovieLens 100k.
# NOTE(review): the paths below are absolute and specific to one machine;
# this cell will only run where that directory layout exists.
print("Dataset: MovieLens 100k")
data_analysis(
    path1='/home/jinfeng/Downloads/doctorant/Conférences/machine learning/data_and_codes/MovieLens100k/models/train.csv',
    path2='/home/jinfeng/Downloads/doctorant/Conférences/machine learning/data_and_codes/MovieLens100k/models/test.csv',
)

Dataset: MovieLens 100k
Train set
(0, 1]     5094
(1, 2]     9824
(2, 3]    23620
(3, 4]    29816
(4, 5]    18218
Name: user_rating, dtype: int64
Number of users: 943
Number of items: 1239


Test set
(0, 1]      0
(1, 2]      1
(2, 3]     47
(3, 4]    533
(4, 5]    362
Name: user_rating, dtype: int64
Number of users: 943
Number of items: 365


In [88]:
# Book dataset (dbook2014).
# NOTE(review): the paths below are absolute and specific to one machine;
# this cell will only run where that directory layout exists.
print("Dataset: Book")
data_analysis(
    path1='/home/jinfeng/Downloads/doctorant/Conférences/machine learning/data_and_codes/dbook2014/models/train.csv',
    path2='/home/jinfeng/Downloads/doctorant/Conférences/machine learning/data_and_codes/dbook2014/models/test.csv',
)

Dataset: Book
Train set
(0, 1]     1101
(1, 2]     3934
(2, 3]    15924
(3, 4]    24982
(4, 5]    19877
Name: rating, dtype: int64
Number of users: 5576
Number of items: 2680


Test set
(0, 1]       0
(1, 2]       3
(2, 3]     127
(3, 4]    1998
(4, 5]    3348
Name: rating, dtype: int64
Number of users: 5476
Number of items: 1873
