# Task 1: Movie Recommendation


In [1]:
# basic imports
import pandas as pd
import numpy as np


## Subtask 1: Data Loading and Data Preparation

In [2]:
# loading in ratings.dat
df_ratings = pd.read_csv('ratings.dat', delimiter='::', lineterminator='\n', header=None, engine='python')
df_ratings.head()


Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


After looking at some rows in `df_ratings`, it is prepared for further calculations:
 - The `timestamp` column is deleted as hinted.
 - Columns renamings.
 - All users which have rated less than 100 movies are dropped.


In [3]:
# drop the timestamp column
df_ratings.drop(columns=[3], inplace=True)

# rename the remaining columns
df_ratings.columns = ['UserID', 'MovieID', 'Rating']

# Get the IDs that appear at least 100 times
id_counts = df_ratings['UserID'].value_counts()
valid_ids = id_counts[id_counts >= 100].index

# Filter the DataFrame to keep only the rows with valid IDs
df_ratings = df_ratings[df_ratings['UserID'].isin(valid_ids)]

In [4]:
# loading in users.dat
df_users = pd.read_csv('users.dat', delimiter='::', lineterminator='\n', header=None, engine='python')
df_users.head()


Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


After looking at some rows in `df_users`, it is prepared for further calculations:
 - The `zip-code` column is deleted as hinted.
 - Column renamings.
 - Dummy variables are created from the `Occupation` column.
 - The `Age` variable is converted into a column called `Age_group`, which ranges from 1 to 5, meaning that it is still a numerical variable because I wanted to keep the ordering, but the distances are equal.
 - The original `Occupation` and `Age` columns are deleted.
 - The `Gender` column is converted into a binary variable column, Male is flagged by 1, Female by 0.

In [5]:
# drop the zip-code column
df_users.drop(columns=[4], inplace=True)

# rename the remaining columns
df_users.columns = ['UserID', 'Gender', 'Age', 'Occupation']

# create mapping dictionaries in order to create the occupation dummy variables with the corresping names instead of the numbers
d_occupations = {
    0:  "other or not specified",
    1:  "academic/educator",
    2:  "artist",
    3:  "clerical/admin",
    4:  "college/grad student",
    5:  "customer service",
    6:  "doctor/health care",
    7:  "executive/managerial",
    8:  "farmer",
    9:  "homemaker",
    10:  "K-12 student",
    11:  "lawyer",
    12:  "programmer",
    13:  "retired",
    14:  "sales/marketing",
    15:  "scientist",
    16:  "self-employed",
    17:  "technician/engineer",
    18:  "tradesman/craftsman",
    19:  "unemployed",
    20:  "writer"
}

# create the dummy variables
dummy_df = pd.get_dummies(df_users['Occupation'], drop_first=True).rename(columns=d_occupations)

# add them to the initial dataframe
df_users = pd.concat([df_users, dummy_df], axis=1)

# create a mapping dictionary to convert Age to Age_group
d_age_group = {
    1: 1,
    18: 2,
    25: 3,
    35: 4,
    45: 5,
    50: 6,
    56: 7
}

# map the variable
df_users['Age_group'] = df_users['Age'].map(d_age_group)

# drop the original columns Occupation and Age
df_users.drop(columns=['Occupation', 'Age'], axis=1, inplace=True)

# convert the Gender variable into binary
df_users = pd.get_dummies(df_users, columns=['Gender'], prefix='gender', drop_first=True)


In [6]:
# loading in movies.dat
df_movies = pd.read_csv('movies.dat', delimiter='::', lineterminator='\n', header=None, engine='python')
df_movies.head()


Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


After looking at some rows in `df_movies`, it is prepared for further calculations:
 - The Title and Year variables are extracted from column 1.
 - Dummy variables are created for the genres.
 - Columns 1 and 2 are deleted.
 - Column 0 is renamed.

In [7]:
# extract the title and year from column 1

df_movies['Title'] = df_movies[1].str[:-7]
df_movies['Year'] = df_movies[1].str[-5:-1]

# a list is created containing the genres of each movie
splitted_genres = [genres.split("|") for genres in df_movies[2].unique()]

# a set is created, which contains every genre exactly once
movie_genres = set()

for genres in splitted_genres:
    movie_genres.update(genres)

# it is transformed into a list
movie_genres = list(movie_genres)

# for each genre a dummy variable is added to the dataframe, containing 1 if the given movie is of that genre
for genre in movie_genres:
    df_movies[genre] = df_movies[2].str.contains(genre)

# drop the original columns 1 and 2
df_movies.drop(columns=[1, 2], inplace=True)

# rename column 0
df_movies.rename(columns={0: 'MovieID'}, inplace=True)


Once our dataframes are processed we can merge them into a master dataframe.

In [8]:
# merge df_ratings and df_users
df_master = pd.merge(df_ratings, df_users, how='left', on='UserID')

# merge also df_movies
df_master = pd.merge(df_master, df_movies, how='left', on='MovieID')

# once the master dataframe is created we drop the columns we cannot use for building models
x = df_master.drop(columns=['UserID', 'MovieID', 'Rating', 'Title']).astype(int)

# we extract the column containing the target variable
y = df_master[['Rating']].astype(int)


Split the data into train set and test set as required.

In [9]:
# Identify the user IDs for the test set
test_user_ids = range(1, 1001)

# Create a mask for rows corresponding to the test set
test_mask = df_master['UserID'].isin(test_user_ids)

# Create the test set
x_test = x[test_mask]
y_test = y[test_mask]

# Create the training set by excluding the test set
x_train = x[~test_mask]
y_train = y[~test_mask]

print(f"The test data contains {len(x_test)} samples.")
print(f"The train data contains {len(x_train)} samples.")


The test data contains 128025 samples.
The train data contains 719277 samples.


Using the whole data did not result in better performance, thus only a small portion of the training set is used. This way the performance does not decrease, but the runtime of the training is significantly lower.

In [10]:
# set the seed in order to keep reproducibility
seed_value = 42
np.random.seed(seed_value)

# specify the percentage of data to use
percentage_to_use = 10

# calculate the number of samples to use
num_samples = int(0.01 * percentage_to_use * len(x_train))

# use the sample method to randomly select the subset
x_train_subset = x_train.sample(n=num_samples, random_state=seed_value)
y_train_subset = y_train.loc[x_train_subset.index]

# reset the index
x_train_subset.reset_index(drop=True, inplace=True)
y_train_subset.reset_index(drop=True, inplace=True)

print(f"The actual train data contains {len(x_train_subset)} samples.")


The actual train data contains 71927 samples.


## Subtask 2: Basic Movie Recommendation


In [11]:
# imports for modeling
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


In [12]:
# the values to consider for the regularization parameter
possible_c_values = {'C': [0.1, 1, 10]}

# constructing the model type and the number of folds for cross-validation
SVM = LinearSVC(multi_class = 'ovr')
cv_svm = 5
svm_clf = GridSearchCV(SVM, possible_c_values, cv=cv_svm, scoring='accuracy')

# training the model
svm_clf.fit(x_train_subset, y_train_subset)


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


GridSearchCV(cv=5, estimator=LinearSVC(), param_grid={'C': [0.1, 1, 10]},
             scoring='accuracy')

In [13]:
# calculating and stating the best performing SVM model's parameter and accuracy on the test set
svm_test_accuracy = svm_clf.score(x_test, y_test)
best_c = svm_clf.best_params_['C']
print(f"SVM Test Accuracy: {svm_test_accuracy} with parameter C set to {best_c}.")


SVM Test Accuracy: 0.3330911931263425 with parameter C set to 0.1.


Training and evaluating an MLP classifier

In [14]:
# the values to consider for the parameter hidden_layer_sizes
possible_hidden_layer_sizes = {'hidden_layer_sizes': [(5,), (10,), (20,), (5, 5), (10, 10)]}

# constructing the model type and the number of folds for cross-validation
MLP = MLPClassifier(random_state=42)
cv_mlp = 5
mlp_clf = GridSearchCV(MLP, possible_hidden_layer_sizes, cv=cv_mlp)

# training the model
mlp_clf.fit(x_train_subset, y_train_subset)


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


GridSearchCV(cv=5, estimator=MLPClassifier(random_state=42),
             param_grid={'hidden_layer_sizes': [(5,), (10,), (20,), (5, 5),
                                                (10, 10)]})

In [15]:
# calculating and stating the best performing MLP model's parameter and accuracy on the test set
mlp_test_accuracy = mlp_clf.score(x_test, y_test)
best_hidden_layer_sizes = mlp_clf.best_params_['hidden_layer_sizes']
print(f"MLPClassifier Test Accuracy: {mlp_test_accuracy} with parameter hidden layer sizes set to {best_hidden_layer_sizes}.")


MLPClassifier Test Accuracy: 0.3532981839484476 with parameter hidden layer sizes set to (5, 5).


Calculating the best performance that can be achieved using a constant prediction.

In [16]:
# look for the most frequent class
most_frequent_class = y_test.value_counts().idxmax()[0]
print(f"The most frequent class in the test set is: {most_frequent_class}")

# build a "prediction" vector which only contains this very value
y_pred = pd.DataFrame({'Rating': [most_frequent_class] * len(y_test)})

# report the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"The best accuracy with constant prediction: {accuracy}")


The most frequent class in the test set is: 4
The best accuracy with constant prediction: 0.35360281195079085


For the SVM I tried 0.1, 1 and 10 as possible values for the regularization parameter. The chosen one is 0.1, but it still only has a test accuracy below 30%, which is better, than random assignment, but worse than using the most frequent class as constant prediction.

For the MLP I tried (5,), (10,), (20,), (5, 5) and (10, 10) as possible values for the hidden layer sizes parameter. The chosen one is (5, 5), which still resulted in a slightly worse accuracy, than using the most frequent class as constant prediction.

## Subtask 3: Classifier Evaluation I
The functions to be used are written

In [17]:
def create_confusion_matrix(df_y, df_y_pred):
    """
    Compute the confusion matrix.

    Parameters:
    - df_y: True labels of the data ranging from 1 to n
    - df_y_pred: Predicted labels by the classifier ranging from min 1 to max n

    Returns:
    - Confusion matrix as a 2D list
    """
    num_classes = int(max(max(df_y.values), max(df_y_pred.values)))
    confusion_matrix = [[0] * num_classes for _ in range(num_classes)]

    for y, y_pred in zip(df_y.values, df_y_pred.values):
        y = int(y)
        y_pred = int(y_pred)
        confusion_matrix[y - 1][y_pred - 1] += 1

    return confusion_matrix

def print_confusion_matrix(confusion_matrix):
    """
    Print the confusion matrix.

    Parameters:
    - confusion_matrix: Confusion matrix as a 2D list
    """
    
    num_classes = len(confusion_matrix)
    
    # Print header
    header = "\t".join([f"Pred_{i+1}" for i in range(num_classes)])
    print(f"\t{header}")

    # Print confusion matrix
    for i in range(num_classes):
        row = "\t".join([str(confusion_matrix[i][j]) for j in range(num_classes)])
        print(f"True_{i+1}\t{row}")
        
def calculate_accuracy(conf_matrix):
    """
    Calculate the classification accuracy from the confusion matrix.

    Parameters:
    - conf_matrix: Confusion matrix as a 2D list

    Returns:
    - Classification accuracy
    """
    total_correct = sum(conf_matrix[i][i] for i in range(len(conf_matrix)))
    total_samples = sum(sum(row) for row in conf_matrix)
    
    accuracy = total_correct / total_samples
    return accuracy


In [18]:
# predictions using the best SVM model
y_pred_svm = pd.DataFrame(svm_clf.predict(x_test))

# compute the confusion matrix
conf_matrix_svm = create_confusion_matrix(y_test, y_pred_svm)

# print the confusion matrix
print_confusion_matrix(conf_matrix_svm)

# calculate the accuracy for SVM
accuracy_svm = calculate_accuracy(conf_matrix_svm)
print(f"SVM Classification Accuracy: {accuracy_svm}")


	Pred_1	Pred_2	Pred_3	Pred_4	Pred_5
True_1	0	0	0	5742	1194
True_2	0	0	0	12008	2397
True_3	0	0	0	28174	6396
True_4	0	0	0	34953	10317
True_5	0	0	0	19153	7691
SVM Classification Accuracy: 0.3330911931263425


In [19]:
# predictions using the best MLP model
y_pred_mlp = pd.DataFrame(mlp_clf.predict(x_test))

# compute the confusion matrix
conf_matrix_mlp = create_confusion_matrix(y_test, y_pred_mlp)

# print the confusion matrix
print_confusion_matrix(conf_matrix_mlp)

# calculate accuracy for MLP
accuracy_mlp = calculate_accuracy(conf_matrix_mlp)
print(f"MLP Classification Accuracy: {accuracy_mlp}")

	Pred_1	Pred_2	Pred_3	Pred_4	Pred_5
True_1	0	0	1059	5871	6
True_2	0	0	1465	12933	7
True_3	0	0	2697	31840	33
True_4	0	0	2728	42483	59
True_5	0	0	1339	25454	51
MLP Classification Accuracy: 0.3532981839484476


The structures of the confusion matrices significantly differ. In the case of the SVM, the model does not predict anything to have a rating other than 4 or 5, and the majority of the samples is predicted to be a 4. In the case of the MLP, an even bigger majority of the predictions is 4, and everything else belongs to the adjacent classes: 3 and 5. This means that the training data is heavily biased towards the rating 4.