In [1]:
!pip install plotly

Collecting plotly
  Downloading plotly-5.7.0-py2.py3-none-any.whl (28.8 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.7.0 tenacity-8.0.1


In [2]:
!pip install plotly --upgrade



In [3]:
from datetime import datetime, date

In [4]:
import pandas as pd

In [5]:
import numpy as np

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [7]:
movies_df = pd.read_csv('./data_preparation/movies.csv', on_bad_lines='skip', sep=";")

In [8]:
ratings_df = pd.read_csv('./data_preparation/ratings.csv', sep=";")

In [9]:
users_df = pd.read_csv('./data_preparation/users.csv', sep=",")

In [10]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3882 entries, 0 to 3881
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3882 non-null   int64 
 1   Title    3882 non-null   object
 2   Genres   3882 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB


In [15]:
ratings_df.drop(['Timestamp'], axis=1, inplace=True)

In [16]:
users_df['birthday'] = users_df['birthday'].str.replace('/0/','/1/')

Função para calcular a idade da pessoa

In [17]:
def age(born):
    today = date.today()
    return today.year - born.year - ((today.month, 
                                      today.day) < (born.month, 
                                                    born.day))

In [18]:
users_df['birthday'] = pd.to_datetime(users_df['birthday'], format='%m/%d/%Y')

In [19]:
users_df['Age'] = users_df['birthday'].apply(age)

In [20]:
users_df = users_df.dropna()

In [22]:
users_df.drop(['birthday', 'Zip-code'], axis=1, inplace=True)

In [24]:
users_df.loc[users_df['Age']<18, 'age_group'] = 1
users_df.loc[users_df['Age'].between(18,24), 'age_group'] = 18
users_df.loc[users_df['Age'].between(25,34), 'age_group'] = 25
users_df.loc[users_df['Age'].between(35,44), 'age_group'] = 35
users_df.loc[users_df['Age'].between(45,49), 'age_group'] = 45
users_df.loc[users_df['Age'].between(50,55), 'age_group'] = 55
users_df.loc[users_df['Age']>55, 'age_group'] = 56

In [26]:
users_df.drop(['name', 'Age'], axis=1, inplace=True)

In [39]:
m = pd.merge(ratings_df, users_df, how = 'inner', on = 'UserID')

In [40]:
m = pd.merge(m, movies_df, how = 'inner', on = 'MovieID')

In [41]:
m = m.drop(['Title'], axis = 1)

In [54]:
m = m[['UserID','Rating','MovieID','Gender','Occupation','age_group','Genres']]

Os atributos foram separados em preditores e de classe

In [44]:
X_ratings = m.iloc[:,2:7].values

In [56]:
y_ratings = m.iloc[:,1].values

In [57]:
from sklearn.preprocessing import LabelEncoder

Transformação de atributos categóricos em números inteiros

In [59]:
label_encoder_Gender = LabelEncoder()
label_encoder_Genres = LabelEncoder()

In [61]:
X_ratings[:,1] = label_encoder_Gender.fit_transform(X_ratings[:,1])
X_ratings[:,4] = label_encoder_Genres.fit_transform(X_ratings[:,4])

Conversão de narrays em dataframes

In [62]:
column_names_X = ["MovieID","Gender", "Occupation", "age_group", "Genres"]
column_names_y = ["Rating"]

In [63]:
X_ratings = pd.DataFrame(X_ratings, columns=column_names_X)
y_ratings = pd.DataFrame(y_ratings, columns=column_names_y)

Árvore de Decisão

In [65]:
class TreeClassifier:
  def fit(self, input, output):
    data = input.copy()
    data['Ratings'] = output
    self.tree = self.decision_tree(data, data, input.columns, 'Ratings')
    return self.tree
    

  def predict(self, input):

    samples = input.to_dict(orient='records')
    predictions = []


    for sample in samples:
      predictions.append(self.make_prediction(sample, self.tree, 1.0))

    return predictions

  def entropy(self, attribute_column):

    values, counts = np.unique(attribute_column, return_counts=True)


    entropy_list = []

    for i in range(len(values)):
      probability = counts[i]/np.sum(counts)
      entropy_list.append(-probability*np.log2(probability))


    total_entropy = np.sum(entropy_list)

    return total_entropy

  def information_gain(self, data, feature_attribute_name, target_attribute_name):
    # find total entropy of given subset
    total_entropy = self.entropy(data[target_attribute_name])


    values, counts = np.unique(data[feature_attribute_name], return_counts=True)


    weighted_entropy_list = []

    for i in range(len(values)):
      subset_probability = counts[i]/np.sum(counts)
      subset_entropy = self.entropy(data.where(data[feature_attribute_name]==values[i]).dropna()[target_attribute_name])
      weighted_entropy_list.append(subset_probability*subset_entropy)

    total_weighted_entropy = np.sum(weighted_entropy_list)


    information_gain = total_entropy - total_weighted_entropy

    return information_gain

  def decision_tree(self, data, original_data, feature_attribute_names, target_attribute_name, parent_node_class=None):

    unique_classes = np.unique(data[target_attribute_name])
    if len(unique_classes) <= 1:
      return unique_classes[0]

    elif len(data) == 0:
      majority_class_index = np.argmax(np.unique(original_data[target_attribute_name], return_counts=True)[1])
      return np.unique(original_data[target_attribute_name])[majority_class_index]

    elif len(feature_attribute_names) == 0:
      return parent_node_class

    else:

      majority_class_index = np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])
      parent_node_class = unique_classes[majority_class_index]


      ig_values = [self.information_gain(data, feature, target_attribute_name) for feature in feature_attribute_names]
      best_feature_index = np.argmax(ig_values)
      best_feature = feature_attribute_names[best_feature_index]

      tree = {best_feature: {}}


      feature_attribute_names = [i for i in feature_attribute_names if i != best_feature]


      parent_attribute_values = np.unique(data[best_feature])
      for value in parent_attribute_values:
        sub_data = data.where(data[best_feature] == value).dropna()

        subtree = self.decision_tree(sub_data, original_data, feature_attribute_names, target_attribute_name, parent_node_class)

        tree[best_feature][value] = subtree

      return tree

  def make_prediction(self, sample, tree, default=1):
    for attribute in list(sample.keys()):

      if attribute in list(tree.keys()):
        try:
          result = tree[attribute][sample[attribute]]
        except:
          return default

        result = tree[attribute][sample[attribute]]

        if isinstance(result, dict):
          return self.make_prediction(sample, result)
        else:
          return result

In [67]:
model = TreeClassifier()
model.fit(X_ratings, y_ratings)


{'MovieID': {1: {'Occupation': {0: {'age_group': {1.0: {'Gender': {0: {'Genres': {145: 5.0}},
        1: {'Genres': {145: 4.0}}}},
      18.0: {'Gender': {0: {'Genres': {145: 5.0}}, 1: {'Genres': {145: 5.0}}}},
      25.0: {'Gender': {0: {'Genres': {145: 5.0}}, 1: {'Genres': {145: 4.0}}}},
      35.0: {'Gender': {0: {'Genres': {145: 5.0}}, 1: {'Genres': {145: 5.0}}}},
      45.0: {'Gender': {0: {'Genres': {145: 5.0}}, 1: {'Genres': {145: 5.0}}}},
      55.0: {'Gender': {0: {'Genres': {145: 2.0}}, 1: {'Genres': {145: 4.0}}}},
      56.0: {'Gender': {0: {'Genres': {145: 4.0}},
        1: {'Genres': {145: 4.0}}}}}},
    1: {'age_group': {1.0: {'Gender': {0: {'Genres': {145: 4.0}},
        1: {'Genres': {145: 5.0}}}},
      18.0: {'Gender': {0: {'Genres': {145: 5.0}}, 1: {'Genres': {145: 4.0}}}},
      25.0: {'Gender': {0: {'Genres': {145: 5.0}}, 1: {'Genres': {145: 5.0}}}},
      35.0: {'Gender': {0: {'Genres': {145: 5.0}}, 1: {'Genres': {145: 4.0}}}},
      45.0: {'Gender': {0: {'Genres'

In [153]:
test_df = pd.read_csv('./test.csv', sep=";")

In [155]:
X_test = test_df.drop(columns = 'Rating')

In [157]:
y_test = test_df['Rating']

In [178]:
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, cohen_kappa_score

In [179]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.3

In [182]:
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [2, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 4, 1]], dtype=int64)

In [183]:
mean_squared_error(y_test, y_pred)

1.3

In [193]:
cohen_kappa_score( y_pred, y_priori)

-0.06060606060606055

In [194]:
priori_df = pd.read_csv('./data_preparation/ratings.csv', sep=";")

In [197]:
mean_df = priori_df.loc[priori_df['MovieID'] == 1]

In [198]:
mean_df.describe()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
count,2077.0,2077.0,2077.0,2077.0
mean,3053.819933,1.0,4.146846,970558600.0
std,1750.606746,0.0,0.852349,9207399.0
min,1.0,1.0,1.0,956712800.0
25%,1532.0,1.0,4.0,965097600.0
50%,3146.0,1.0,4.0,971124300.0
75%,4571.0,1.0,5.0,974877400.0
max,6040.0,1.0,5.0,1045711000.0


O y_priori foi estabelecido manualmente a partir do valor da média de mean_df.describe()

In [191]:
y_priori = [4,4,4,3,4,4,3,4,3,4]

In [185]:
accuracy_score(y_priori, y_pred)

0.3

In [186]:
mean_squared_error(y_priori, y_pred)

1.6

In [187]:
confusion_matrix(y_priori, y_pred)

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [2, 0, 0, 1, 0],
       [0, 1, 2, 3, 1],
       [0, 0, 0, 0, 0]], dtype=int64)

In [190]:
cohen_kappa_score(y_test, y_pred)

0.16666666666666674