In [1]:
import pandas as pd

In [2]:
# Location of the combined songs CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing it at a local copy of the file.
path = r"C:\Users\beall\ColabDocs\Bootcamp\Final Project\data1_combined_df.csv"

# Read the whole file into one frame and show its column names as a sanity check.
d_and_f = pd.read_csv(filepath_or_buffer=path)
d_and_f.columns

Index(['artist', 'song', 'duration_ms', 'explicit', 'year', 'popularity',
       'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'genre', 'artist_id', 'followers'],
      dtype='object')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer

# feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression, RFECV, SelectFromModel

# Regression models
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error     #for mean_squared_error and root_mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score  

from sklearn import set_config
# Make every scikit-learn transformer return pandas DataFrames instead of
# NumPy arrays, so column names are carried through the pipeline steps.
set_config(transform_output="pandas")

### Preprocessing

In [4]:
# Discard fully duplicated rows (first occurrence is kept) before any indexing
# or splitting takes place.
d_and_f = d_and_f.drop_duplicates()

In [5]:
# Move the identifying columns into the index so they are carried along with
# each row but are not treated as model features.
d_and_f = d_and_f.set_index(keys=['song', 'artist_id'])

In [6]:
# Detach the regression target. pop() also removes the column from d_and_f,
# so whatever remains in the frame afterwards is feature data only.
y = d_and_f.pop(item="popularity")

In [7]:
# Feature matrix: an independent (deep) copy of what is left in d_and_f after
# the target column was popped.
X = d_and_f.copy(deep=True)

In [8]:
from sklearn.model_selection import train_test_split

# train_test_split returns its pieces in a fixed order, which the unpacking
# below must follow: X_train, X_test, y_train, y_test.
# 20% of the rows are held out for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31416,
)

In [9]:
# Names of the numeric feature columns; these are routed to the numeric pipeline.
X_num_columns = X.select_dtypes(include=["number"]).columns

In [10]:
# Numeric branch: fill missing values with the column mean learned during fit.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
)

In [11]:
# Names of every non-numeric feature column; these are routed to the
# categorical pipeline.
X_cat_columns = X.select_dtypes(exclude=["number"]).columns

In [12]:
# Categorical branch: impute missing values, then one-hot encode.
categoric_pipe = make_pipeline(
    # Missing categories are replaced by the most frequent value of the column.
    # The former `fill_value="N_A"` argument was dropped: SimpleImputer only
    # consults fill_value when strategy="constant", so it never had any effect.
    # Use strategy="constant", fill_value="N_A" instead if missing values
    # should get their own category.
    SimpleImputer(strategy="most_frequent"),
    # Dense output is required because the notebook asks transformers for
    # pandas output. No min_frequency/max_categories is set, so no category is
    # grouped as "infrequent"; categories unseen during fit are encoded as
    # all-zero rows.
    OneHotEncoder(sparse_output=False, handle_unknown="infrequent_if_exist"),
)

In [13]:
# Route each group of columns through its own branch and concatenate the
# results. remainder="drop" is the library default, spelled out here: any
# column not listed in either group would be discarded.
preprocessor = make_column_transformer(
    (numeric_pipe, X_num_columns),
    (categoric_pipe, X_cat_columns),
    remainder="drop",
)
# Display the (still unfitted) transformer as the cell output.
preprocessor

### Model processing

In [14]:
from sklearn.neighbors import KNeighborsRegressor

In [15]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

In [16]:
from functools import partial

# End-to-end KNN regression pipeline:
#   1. preprocessor       - impute numeric / impute + one-hot categorical columns
#   2. MinMaxScaler       - put every feature on the same scale (KNN is distance-based)
#   3. VarianceThreshold  - drop features that are constant in the training data
#   4. SelectKBest        - keep the 10 features with the highest mutual information
#   5. KNeighborsRegressor with default hyper-parameters
#
# mutual_info_regression adds random noise to continuous features, so without a
# seed the selected features (and all downstream metrics) can change from run
# to run. The seed is fixed to the same value used for the train/test split.
# functools.partial is used rather than a lambda so the fitted pipeline can
# still be pickled.
pipe_Kneighbors = make_pipeline(
    preprocessor,
    MinMaxScaler(),
    VarianceThreshold(threshold=0.0),
    SelectKBest(
        score_func=partial(mutual_info_regression, random_state=31416),
        k=10,
    ),
    KNeighborsRegressor(),
)

In [17]:
# Fit every step of the pipeline on the training split only; the fitted
# pipeline is returned and shown as the cell output.
pipe_Kneighbors.fit(X=X_train, y=y_train)

In [18]:
# Predictions on the data the model was fitted on.
# NOTE(review): not consumed by any metric cell visible in this notebook.
y_train_pred = pipe_Kneighbors.predict(X=X_train)

In [19]:
# Predictions on the held-out split; these feed the evaluation metrics.
y_test_pred = pipe_Kneighbors.predict(X=X_test)

In [20]:
# Evaluate the KNN pipeline on the held-out test split.
Kneighbors_mae = mean_absolute_error(y_true=y_test, y_pred=y_test_pred)
# RMSE is the square root of the MSE. It is computed explicitly because the
# `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
Kneighbors_rmse = mean_squared_error(y_true=y_test, y_pred=y_test_pred) ** 0.5
Kneighbors_mape = mean_absolute_percentage_error(y_true=y_test, y_pred=y_test_pred)
Kneighbors_r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

In [21]:
# One-row summary table of the test metrics, labelled with the model name.
# NOTE(review): MAPE divides by |y_true|; the very large value recorded in the
# output (~1.8e16) suggests some test targets are 0 - confirm before relying
# on this metric.
pd.DataFrame(
    [[Kneighbors_mae, Kneighbors_rmse, Kneighbors_mape, Kneighbors_r2]],
    columns=["MAE", "RMSE", "MAPE", "R2"],
    index=["Kneighbors"],
)

Unnamed: 0,MAE,RMSE,MAPE,R2
Kneighbors,15.793316,23.257808,1.764624e+16,-0.115032


In [23]:
# Alias the KNN pipeline (fitted in an earlier cell) under the name that the
# export cell below pickles. This is the same object, not a copy.
trained_pipe = pipe_Kneighbors
# TODO: "add cvs" - presumably "add cross-validation"; confirm with the author.
# The refit below is left commented out, so the pipeline is not fitted again here.
#trained_pipe.fit(X_train, y_train)

In [31]:
# Store the trained pipeline on disk so it can be reloaded without refitting.
# NOTE: only unpickle this file from a trusted source - pickle.load can execute
# arbitrary code.
import pickle

# The context manager guarantees the file is flushed and closed even if
# pickling fails; the previous bare open() call never closed the handle.
with open(file='trained_pipe_knn.sav', mode='wb') as model_file:
    pickle.dump(trained_pipe, model_file)