In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from configparser import ConfigParser
import json
import os
from os.path import exists
import pickle
import numpy as np
from collections import OrderedDict
import matplotlib.pyplot as plt
import re

from myfm import MyFMRegressor

In [None]:
# Load the project settings from config.ini and pull out the values used by
# the rest of the notebook.
config = ConfigParser()
config.read("config.ini")

# [DATASETS] section: dataset paths and file names.
dataset_info = config["DATASETS"]
fm_data_path, fm_data_path_new, rf_data_path = (
    dataset_info[option] for option in ('fm_path', 'fm_path_new', 'rf_path')
)
train_set_filename, test_set_filename, description_filename = (
    dataset_info[option]
    for option in ('train_filename', 'test_filename', 'description_filename')
)

# [MODELS] section: model path and model file names.
model_info = config["MODELS"]
model_path, fm_filename, rf_filename = (
    model_info[option]
    for option in ('model_path', 'fm_model_filename', 'rf_model_filename')
)

# [FEATURE_SELECTION] section: feature-selection paths.
fs_info = config['FEATURE_SELECTION']
fs_path, fs_final_path = (
    fs_info[option] for option in ('fs_rf_path', 'fs_final_path')
)

In [None]:
# Read the pipe-separated, header-less u.genre file and keep its first column
# as a plain Python list (used later as the list of genre column names).
path = dataset_info['ori_path']
genre_table = pd.read_csv(path + 'ml-100k/u.genre', sep='|', header=None)
genre_cols = genre_table[0].to_numpy().tolist()

### FM Training and Testing data

In [None]:
# Load the pickled train/test frames. The context managers guarantee that the
# file handles are closed once the data has been read.
# NOTE(review): unpickling can execute arbitrary code — only load files that
# were produced by this project.
with open(fm_data_path_new + train_set_filename, 'rb') as train_file:
    train = pickle.load(train_file)
with open(fm_data_path_new + test_set_filename, 'rb') as test_file:
    test = pickle.load(test_file)

# Separate the features from the 'rating' column, which is the target.
X_train = train.drop(['rating'], axis=1)
X_test = test.drop(['rating'], axis=1)
y_train = train['rating']
y_test = test['rating']

# The description JSON maps a key to a value per entry; the values, in file
# order, are used as the group shapes.
with open(fm_data_path + description_filename) as f:
    meta_data = json.load(f)
group_shapes = list(meta_data.values())

# Baseline Model

In [None]:
# Fit the factorization machine on the complete feature matrix.
fm = MyFMRegressor(rank=10, random_seed=332)
fm.fit(X_train, y_train, n_iter=200, n_kept_samples=200)

# Score the model on the test split; the residuals are computed once and
# reused for both metrics.
prediction = fm.predict(X_test)
errors = y_test - prediction
rmse = (errors ** 2).mean() ** .5
mae = np.abs(errors).mean()
print(f'rmse={rmse}, mae={mae}')

---

# Random Forest column names to MyFM columns

In [None]:
# Collect the implemented methods: each file name found under `fs_final_path`
# stands for one method. Only directories that contain exactly one
# sub-directory contribute their files.
# NOTE(review): the file names are stored unchanged (including any extension),
# while the loading cell appends ".json" to them — confirm the files on disk
# are named accordingly.
available_methods_rf = [
    file_name
    for _root, sub_dirs, file_names in os.walk(fs_final_path)
    if len(sub_dirs) == 1
    for file_name in file_names
]
pd.DataFrame(available_methods_rf)

In [None]:
# Prepare one (initially empty) entry per discovered feature-selection method;
# the method name is the dictionary key.
selected_col_dict_rf = {method_name: [] for method_name in available_methods_rf}
selected_col_dict_rf.keys()

In [None]:
# Store the parsed JSON content of every method under its key. The context
# manager closes each file as soon as it has been read.
for method_name in available_methods_rf:
    with open(fs_final_path + method_name + ".json") as f:
        selected_col_dict_rf[method_name] = json.load(f)

In [None]:
# # choose the method
# choose_method = 4
# sel_from_model_col_rf = selected_col_dict_rf[available_methods_rf[choose_method]]["columns"].copy()
# available_methods_rf[choose_method]

# Training MyFM model

There are two formats in which we get the columns from feature selection

1. Some feature selection methods return only the requested number of features, which is set through the parameter k. A separate list is therefore created for each value of k, and each list is imported separately here to train the myFM model.

In [None]:
path = 'feature_selection/final/f-classif/'

# Collect the available result sets: one per JSON file located directly in
# `path`, identified by the file name without its ".json" extension.
# Only top-level ".json" files are kept, because the names are turned back into
# `path + name + ".json"` when the files are loaded; nested files (for example
# notebook checkpoints) or files with another extension could not be opened
# that way.
available_methods_sfm = []
for subdir, dirs, files in os.walk(path):
    for file in files:
        if file.endswith('.json'):
            available_methods_sfm.append(file[:-5])
    break  # os.walk yields `path` itself first; do not descend any further
pd.DataFrame(available_methods_sfm)

In [None]:
# Parse every result file and store its content under the method name. The
# context manager closes each file as soon as it has been read.
select_from_model_dict = {}
for method_name in available_methods_sfm:
    with open(path + method_name + ".json") as f:
        select_from_model_dict[method_name] = json.load(f)

In [None]:
# Visit the "f-classif_<k>" entries for k = 50, 100, ... and take a reversed
# slice of each entry's 'columns' list.
# NOTE(review): the reversed slice is neither assigned nor displayed, so this
# loop changes nothing; its only observable effect is a KeyError when an
# expected "f-classif_<k>" key (or its 'columns' entry) is missing.
# NOTE(review): range(1, len(...)) stops one short of len(...), so the entry
# for k = len(...) * 50 is not visited — confirm whether that is intended.
for i in range(1,len(select_from_model_dict)):
    temp = "f-classif_" + str(i*50)
    select_from_model_dict[temp]['columns'][::-1]

In [None]:
def covert_fs_to_fm(cols, X_fm, genre_cols):
    '''
    Convert the columns chosen by a feature-selection method into the layout
    passed to the MyFM model.

    The result always starts with every ``user_id`` and ``movie_id`` column of
    ``X_fm``. The selected side features follow, one group per feature family
    (timestamp, release_date, age, sex, occupation, zip_code), and all selected
    genre columns are appended last as one single group.

    Parameters
    ----------
    cols : list
        Names of the selected columns; every name must be a column of ``X_fm``.
    X_fm : pandas.DataFrame
        The complete MyFM feature matrix.
    genre_cols : list
        All genre column names. Each genre is a separate column, but for MyFM
        the selected genres are combined into a single group.

    Returns
    -------
    X_train_temp_fs : pandas.DataFrame
        The re-assembled feature matrix.
    group_shapes : OrderedDict
        Group name -> number of columns in that group, in column order.
    group_shapes_list : list
        The group sizes only, in the same order.
    '''
    # -----------------------------------------------------------------------
    # User and movie id blocks are always kept
    # -----------------------------------------------------------------------
    user_part = X_fm.filter(regex='user_id')
    movie_part = X_fm.filter(regex='movie_id')
    parts = [user_part, movie_part]

    # Group sizes are measured on the actual blocks, so they always add up to
    # the number of columns of the returned matrix.
    group_shapes = OrderedDict()
    group_shapes['user_id'] = user_part.shape[1]
    # The key spelling 'movie_id_id' is kept unchanged for callers that read
    # the returned dictionary.
    group_shapes['movie_id_id'] = movie_part.shape[1]

    X_temp = X_fm[cols]

    # -----------------------------------------------------------------------
    # One group per side-feature family that has at least one selected column
    # -----------------------------------------------------------------------
    for i in ['timestamp', 'release_date', 'age', 'sex', 'occupation', 'zip_code']:
        matched = X_temp.filter(regex=i)
        if len(matched.columns):
            parts.append(matched)
            group_shapes[i] = len(matched.columns)

    # -----------------------------------------------------------------------
    # All selected genres form one group
    # -----------------------------------------------------------------------
    # Genres are matched by exact column name and kept in the order of
    # `genre_cols`: a regex/substring match could pull in extra columns that
    # the group size would not account for, and set iteration order is not
    # stable between runs.
    selected = set(X_temp.columns)
    selected_genres = [g for g in dict.fromkeys(genre_cols) if g in selected]
    if selected_genres:
        genre_part = X_temp[selected_genres]
        parts.append(genre_part)
        group_shapes['genre'] = genre_part.shape[1]

    # Concatenate once instead of growing the frame step by step.
    X_train_temp_fs = pd.concat(parts, axis=1)
    group_shapes_list = list(group_shapes.values())
    return X_train_temp_fs, group_shapes, group_shapes_list

In [None]:
# Convert the training matrix for the "f-classif_400" selection (8 * 50) and
# display the shape of the result.
selected_entry = select_from_model_dict["f-classif_" + str(8*50)]
sel_from_model_col_rf = selected_entry['columns'][::-1]
X_train_t, group_shapes_dict, group_shapes = covert_fs_to_fm(
    sel_from_model_col_rf, X_train, genre_cols
)
X_train_t.shape

In [None]:
rmse_results = []

# user_id / movie_id blocks of both splits (not referenced by the loop below).
X_train_temp = pd.concat(
    [X_train.filter(regex='user_id'), X_train.filter(regex='movie_id')], axis=1
)
X_test_temp = pd.concat(
    [X_test.filter(regex='user_id'), X_test.filter(regex='movie_id')], axis=1
)

# Train and score one model per selection size i * 50.
for i in range(10, 11):

    selection_key = "f-classif_" + str(i*50)
    sel_from_model_col_rf = select_from_model_dict[selection_key]['columns'][::-1]

    # Apply the same column conversion to both splits.
    X_train_final, group_shapes_dict, group_shapes = covert_fs_to_fm(
        sel_from_model_col_rf, X_train, genre_cols
    )
    X_test_final, group_shapes_dict, group_shapes = covert_fs_to_fm(
        sel_from_model_col_rf, X_test, genre_cols
    )

    print(X_train_final.shape)
    fm = MyFMRegressor(rank=10, random_seed=332)
    fm.fit(
        X_train_final,
        y_train,
        n_iter=200,
        n_kept_samples=200,
        group_shapes=group_shapes,
    )

    # Test-set metrics; the residuals are computed once for both of them.
    prediction = fm.predict(X_test_final)
    errors = y_test - prediction
    rmse = (errors ** 2).mean() ** .5
    mae = np.abs(errors).mean()
    print(f'rmse={rmse}, mae={mae}')
    rmse_results.append(rmse)
    del fm

## Tree based methods

Tree-based methods produce only one feature importance column. The columns are sorted by importance. Each time, the top N features are selected and given to the myFM model.

In [None]:
# Pick a method by its position in `available_methods_rf` and work on a copy
# of its column list.
choose_method = 0
chosen_method_name = available_methods_rf[choose_method]
sel_from_model_col_rf = selected_col_dict_rf[chosen_method_name]["columns"].copy()

In [None]:
# Number of entries in the chosen method's column list.
len(sel_from_model_col_rf)

In [None]:
# Name of the method currently selected through `choose_method`.
available_methods_rf[choose_method]

In [None]:
rmse_results = []

# choose the method
choose_method = 2
chosen_method_name = available_methods_rf[choose_method]
sel_from_model_col_rf = selected_col_dict_rf[chosen_method_name]["columns"].copy()
print(chosen_method_name)

# user_id / movie_id blocks of both splits (not referenced by the loop below).
X_train_temp = pd.concat(
    [X_train.filter(regex='user_id'), X_train.filter(regex='movie_id')], axis=1
)
X_test_temp = pd.concat(
    [X_test.filter(regex='user_id'), X_test.filter(regex='movie_id')], axis=1
)

# Only one value of k is evaluated here: the length of the whole column list.
for k in [len(sel_from_model_col_rf)]:

    top_k_cols = sel_from_model_col_rf[:k]
    X_train_final, group_shapes_dict, group_shapes = covert_fs_to_fm(
        top_k_cols, X_train, genre_cols
    )
    X_test_final, group_shapes_dict, group_shapes = covert_fs_to_fm(
        top_k_cols, X_test, genre_cols
    )

    print(X_train_final.shape)
    fm = MyFMRegressor(rank=10, random_seed=332)
    # NOTE(review): `group_shapes` is computed above but not passed to fit()
    # here, whereas the f-classif training loop does pass it — confirm whether
    # the difference is intentional.
    fm.fit(X_train_final, y_train, n_iter=200, n_kept_samples=200)

    # Test-set metrics; the residuals are computed once for both of them.
    prediction = fm.predict(X_test_final)
    errors = y_test - prediction
    rmse = (errors ** 2).mean() ** .5
    mae = np.abs(errors).mean()
    print(f'rmse={rmse}, mae={mae}')
    rmse_results.append(rmse)

# Save and plot model

In [None]:
# Keep a copy of the scores as they are, then prepend a 0 to the working list
# (in place).
rmse_results_copy = rmse_results[:]
rmse_results[:0] = [0]

In [None]:
# Plot the validation RMSE against the number of additional parameters, one
# dashed, star-marked line per column of `df`; the first value of each series
# is skipped ([1:]). The x limits are given as (550, -50), i.e. in descending
# order.
# NOTE(review): `df` is not defined in any cell visible in this notebook, and
# `rmse_results` is built as a plain Python list by the training cells, which
# has no `.iloc` — as written this cell cannot run after "Restart & Run All".
# It presumably expects both names to be DataFrames holding one column of
# results per method — confirm and load them explicitly.
# NOTE(review): `c` and `i` always hold the same value and `c` is unused.
colors = ['olive','b','c','m','y','brown','r','g']
plt.figure(figsize=(12,6))
plt.ylim(0.85,0.93)
plt.xlim(550,-50)
for c, i in enumerate(range(len(df.columns))):
    plt.plot([0,50,100,150,200,250,300,350,400,450,500,524], rmse_results.iloc[:,i].values[1:], color = colors[i], label = df.iloc[:,i].name, linestyle = '--', marker = '*')
plt.xlabel("No of Additional Parameters")
plt.ylabel("Validation RMSE")
plt.legend(bbox_to_anchor=(1.31,1))
plt.show()

In [None]:
# Write the collected RMSE values to the f-classif results file as text.
results_file = "feature_selection/final_results/f-classif.csv"
np.savetxt(results_file, rmse_results, fmt='% s', delimiter=", ")