## Final modeling 

*Best-performing feature-selection and classification pipeline for each dataset, followed by the final predictions.*

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.feature_selection import RFE, SelectFromModel, SelectKBest, mutual_info_classif
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings(action="ignore")

# Seaborn defaults used by any figure drawn in this notebook.
sns.set_style(style='whitegrid')
sns.set_palette(palette='dark')

# Seed passed as `random_state` to the estimators below.
rs = 123

### Data preparation

In [2]:
# Load the raw feature matrices and training labels for both datasets.

def _read_features(path):
    """Read a space-separated, header-less feature file into a DataFrame."""
    return pd.read_csv(path, sep=" ", header=None)


def _read_labels(path):
    """Read a header-less label file and return its first column as a NumPy array."""
    label_frame = pd.read_csv(path, header=None)
    return np.array(label_frame[0].tolist())


# artificial dataset
art_x_train = _read_features('./data/artificial_train.data')
art_x_test = _read_features('./data/artificial_valid.data')
art_y_train = _read_labels('./data/artificial_train.labels')

# digits dataset
dig_x_train = _read_features('./data/digits_train.data')
dig_x_test = _read_features('./data/digits_valid.data')
dig_y_train = _read_labels('./data/digits_train.labels')

In [3]:
# Drop the unnecessary trailing NaN column from every feature frame.
# NOTE(review): the empty column presumably comes from a trailing separator at
# the end of each row in the .data files — confirm against the raw files.

def _drop_trailing_nan_column(frame):
    """Return `frame` without its last column when that column is entirely NaN.

    Unlike an unconditional in-place drop of `columns[-1]`, this is safe to
    re-run: once the NaN column is gone, the frame is returned unchanged
    instead of losing one more (real) feature on every execution of the cell.
    """
    if frame.shape[1] == 0:
        return frame
    if frame.iloc[:, -1].isna().all():
        return frame.drop(columns=frame.columns[-1])
    return frame


art_x_train = _drop_trailing_nan_column(art_x_train)
art_x_test = _drop_trailing_nan_column(art_x_test)
dig_x_train = _drop_trailing_nan_column(dig_x_train)
dig_x_test = _drop_trailing_nan_column(dig_x_test)

In [4]:
# Scaling: each scaler is fitted on the training split only and then applied
# to both the training and the test split of the same dataset.
# (Another scaling method could be tried and compared.)
std = StandardScaler().fit(art_x_train)
art_x_train_scaled = std.transform(art_x_train)
art_x_test_scaled = std.transform(art_x_test)

std = StandardScaler().fit(dig_x_train)
dig_x_train_scaled = std.transform(dig_x_train)
dig_x_test_scaled = std.transform(dig_x_test)

### Artificial dataset - best result

In [5]:
# SelectKBest: number of top-scoring features kept by the first selection step
k=100

In [6]:
def _seeded_mutual_info(X, y):
    """Score features with `mutual_info_classif`, seeded with the notebook seed `rs`.

    `mutual_info_classif` is randomised; without a fixed `random_state` the
    set of selected features can change between runs.
    """
    return mutual_info_classif(X, y, random_state=rs)


# Keep the k features with the highest mutual information with the label.
selector = SelectKBest(score_func=_seeded_mutual_info, k=k)
selector.fit(art_x_train_scaled, art_y_train)
art_x_train_kbest = selector.transform(art_x_train_scaled)
art_x_test_kbest = selector.transform(art_x_test_scaled)

In [7]:
# RFE: number of features kept by recursive feature elimination.
# NOTE(review): the original "10 cv" remark presumably means this value was
# chosen with 10-fold cross-validation — confirm.
n_of_features = 7

In [8]:
# Recursive feature elimination with a random forest as the ranking model,
# run on the SelectKBest-reduced matrices.
clf = RandomForestClassifier(random_state=rs)
rfe = RFE(estimator=clf, n_features_to_select=n_of_features).fit(
    art_x_train_kbest, art_y_train
)
art_x_train_rfe, art_x_test_rfe = (
    rfe.transform(split) for split in (art_x_train_kbest, art_x_test_kbest)
)

In [9]:
# Final model for the artificial dataset: a random forest trained on the
# RFE-selected features, then class probabilities for the test split.
rf = RandomForestClassifier(random_state=rs).fit(art_x_train_rfe, art_y_train)
y_pred = rf.predict_proba(art_x_test_rfe)

In [10]:
# Export the selected feature indices as column positions in the ORIGINAL
# artificial feature matrix.
#
# `rfe` was fitted on the SelectKBest-reduced matrix, so its support only
# indexes into those k columns. Those positions are mapped back through the
# SelectKBest support to recover the original column indices.
assert isinstance(selector, SelectKBest), (
    "`selector` must be the SelectKBest fitted on the artificial data; "
    "re-run the SelectKBest cell before this one"
)
kbest_columns = selector.get_support(indices=True)  # original indices of the k kept columns
rfe_positions = rfe.get_support(indices=True)       # positions within those k columns

# NOTE(review): indices are written 0-based — confirm the expected submission format.
features = pd.DataFrame({'AGAMAK': kbest_columns[rfe_positions]})
features.to_csv('AGAMAK_artificial_features.txt', sep='\t', index=False)

In [11]:
# Write the second `predict_proba` column as a one-column, tab-separated file.
probs = pd.DataFrame({'AGAMAK': y_pred[:, 1]})
probs.to_csv('AGAMAK_artificial_prediction.txt', sep='\t', index=False)

### Digits dataset - best result

In [12]:
# L1-penalised linear SVM used as the feature selector for the digits data.
c = 0.0025
l1_svc = LinearSVC(C=c, penalty="l1", dual=False, random_state=rs)

# Pass the unfitted estimator and let SelectFromModel train it in `fit`.
# Combining `prefit=True` with a later `.fit()` call is contradictory and
# raises NotFittedError on scikit-learn versions before 1.1.
selector = SelectFromModel(l1_svc)
selector.fit(dig_x_train_scaled, dig_y_train)

In [13]:
# Reduce both digits splits to the columns kept by the L1-based selector.
dig_x_train_l1, dig_x_test_l1 = (
    selector.transform(split) for split in (dig_x_train_scaled, dig_x_test_scaled)
)

In [14]:
# Sanity check: (rows, columns) of the training matrix after the L1-based selection
dig_x_train_l1.shape

(6000, 106)

In [15]:
# Final model for the digits dataset: a random forest trained on the
# L1-selected features, then class probabilities for the test split.
estimator = RandomForestClassifier(random_state=rs).fit(dig_x_train_l1, dig_y_train)
y_pred = estimator.predict_proba(dig_x_test_l1)

In [16]:
# Export the indices of the columns kept by the digits selector.
# The indices are read straight from the selector's support mask instead of
# being parsed out of generated "x<i>" feature names, which would break if the
# selector were ever fitted on data carrying real column names.
selected_columns = [int(idx) for idx in selector.get_support(indices=True)]
features = pd.DataFrame({'AGAMAK': selected_columns})
features.to_csv('AGAMAK_digits_features.txt', sep='\t', index=False)

In [17]:
# Write the second `predict_proba` column as a one-column, tab-separated file.
probs = pd.DataFrame({'AGAMAK': y_pred[:, 1]})
probs.to_csv('AGAMAK_digits_prediction.txt', sep='\t', index=False)