In [1]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm
import pandas as pd
import itertools
import math
import os

In [2]:
# Read the four CSV splits from the local 'data' directory in one pass:
# feature matrices (train_x / test_x) and their targets (train_y / test_y).
train_x, train_y, test_x, test_y = (
    pd.read_csv(os.path.join('data', file_name))
    for file_name in (
        'train_x_sep.csv',
        'train_y.csv',
        'test_x_sep.csv',
        'test_y.csv',
    )
)

In [3]:
# Group the columns of train_x by feature group: a column belongs to the first
# entry of `features` (in list order) whose name occurs as a substring of the
# column name. Columns matching no group are left out, and a group with no
# matching column gets no key at all.
features = ['img', 'category', 'publish_day', 'description', 'title', 'tags']
associated_column_names = {}
for column_name in train_x.columns:
    matched_feature = next(
        (candidate for candidate in features if candidate in column_name),
        None,
    )
    if matched_feature is None:
        continue
    associated_column_names.setdefault(matched_feature, []).append(column_name)

In [4]:
# For every feature group, keep the sub-frame of train_x / test_x restricted
# to the columns associated with that group.
train_name_to_df = {
    group: train_x[associated_column_names[group]] for group in features
}
test_name_to_df = {
    group: test_x[associated_column_names[group]] for group in features
}

In [5]:
# Exhaustive search over feature-group combinations.
#
# For every combination of `combo_size` feature groups, the groups' columns are
# placed side by side, a LinearRegression is fitted on the training split and
# scored on the test split with MAE (L1), MSE (L2) and R^2. The best
# combination for each metric is remembered and reported at the end.
#
# NOTE(review): candidates are ranked on the test split itself, so the
# reported scores are optimistic estimates; consider a validation split.

# Number of feature groups combined per candidate model (1 = single groups).
combo_size = 1

best_l1 = math.inf
best_l1_features = None
best_l2 = math.inf
best_l2_features = None
# R^2 is bounded above by 1 but has no lower bound, so start from -inf;
# starting from -1 would silently skip every model scoring below -1.
best_r2 = -math.inf
best_r2_features = None

# The loop variable must not be called `features`: rebinding the global list
# to a tuple would break any re-run of this cell or of the cells above it.
for feature_combo in itertools.combinations(features, combo_size):
    # Column-wise concatenation on the shared row index (inner join, like an
    # index-on-index merge); for a single group this is just that group's frame.
    _train_x = pd.concat(
        [train_name_to_df[name] for name in feature_combo], axis=1, join='inner'
    )
    _test_x = pd.concat(
        [test_name_to_df[name] for name in feature_combo], axis=1, join='inner'
    )

    lr = LinearRegression()
    lr.fit(_train_x, train_y)

    y_pred = lr.predict(_test_x)
    l1 = mean_absolute_error(test_y, y_pred)
    l2 = mean_squared_error(test_y, y_pred)
    r2 = r2_score(test_y, y_pred)

    if l1 < best_l1:
        best_l1 = l1
        best_l1_features = feature_combo

    if l2 < best_l2:
        best_l2 = l2
        best_l2_features = feature_combo

    # Since R^2 <= 1, "closest to 1" is the same as "largest".
    if r2 > best_r2:
        best_r2 = r2
        best_r2_features = feature_combo

# The L1 and L2 errors are printed as natural logarithms; R^2 is printed as is.
print(f'L1: {best_l1_features} \t {math.log(best_l1)}')
print(f'L2: {best_l2_features} \t {math.log(best_l2)}')
print(f'R2: {best_r2_features} \t {best_r2}')

L1: ('category',) 	 14.771818108410137
L2: ('title',) 	 31.428897498394907
R2: ('title',) 	 0.1503417186728946


In [6]:
# Report the best feature combination per metric, one line each.
# The L1 and L2 errors are shown as natural logarithms; R^2 is shown as is.
print(
    f'L1: {best_l1_features} \t {math.log(best_l1)}',
    f'L2: {best_l2_features} \t {math.log(best_l2)}',
    f'R2: {best_r2_features} \t {best_r2}',
    sep='\n',
)

L1: ('category',) 	 14.771818108410137
L2: ('title',) 	 31.428897498394907
R2: ('title',) 	 0.1503417186728946
