In [7]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

In [8]:
# Load the cleaned music dataset and discard the "Unnamed: 0" column
# (presumably a row index that was saved into the CSV -- confirm).
df = pd.read_csv("./music_clean.csv").drop(columns = "Unnamed: 0")
df.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,genre
0,60.0,0.896,0.726,214547.0,0.177,2e-06,0.116,-14.824,0.0353,92.934,0.618,1
1,63.0,0.00384,0.635,190448.0,0.908,0.0834,0.239,-4.795,0.0563,110.012,0.637,1
2,59.0,7.5e-05,0.352,456320.0,0.956,0.0203,0.125,-3.634,0.149,122.897,0.228,1
3,54.0,0.945,0.488,352280.0,0.326,0.0157,0.119,-12.02,0.0328,106.063,0.323,1
4,55.0,0.245,0.667,273693.0,0.647,0.000297,0.0633,-7.787,0.0487,143.995,0.3,1


In [9]:
# Blank out 200 randomly chosen (row, column) cells so the imputation steps
# below have something to do. Draws are made with replacement, so the same
# cell may be picked more than once and fewer than 200 distinct cells may end
# up missing.
#
# A seeded generator is used so that Restart & Run All blanks the same cells
# every time; with the unseeded global RNG every run produced a different
# dataset and therefore different scores further down.
rng = np.random.default_rng(42)
for _ in range(200):
  row_pos = rng.integers(0, len(df))
  # The upper bound is exclusive, so the last column (the "genre" target in
  # the frame displayed above) is never blanked.
  col_pos = rng.integers(0, len(df.columns) - 1)
  df.iloc[row_pos, col_pos] = np.nan

# Without Pipeline

In [10]:
# Separate the "genre" target from the feature columns.
y = df["genre"]
X = df.drop(columns = "genre")

# Manual preprocessing, step 1: replace each missing value with its column mean.
x_imputer = SimpleImputer(strategy = "mean")
X_imputed = x_imputer.fit_transform(X)

# Manual preprocessing, step 2: standardize the imputed features.
# Note that both transformers are fit on every row of X, i.e. before any
# train/validation split takes place.
scaler = StandardScaler()
X_imputed_scaled = scaler.fit_transform(X_imputed)

In [11]:
# Randomized hyper-parameter search for a logistic-regression classifier on the
# manually preprocessed features.
#
# random_state is set on the estimator because the "sag", "saga" and
# "liblinear" solvers in the search space shuffle the data internally.
logreg = LogisticRegression(random_state = 42)

# Candidate values sampled by the search.
params = {
  "tol": np.linspace(0.01, 1.0, 20),
  "C": np.linspace(0.01, 1.0, 20),
  "class_weight": ["balanced", {0:0.8, 1:0.2}],
  "solver": ["newton-cg", "newton-cholesky", "lbfgs", "liblinear", "sag", "saga"]
}
kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

# Seed the search itself: without random_state the sampled candidates (and so
# best_params_ / best_score_) change on every run even though the folds are fixed.
logreg_cv = RandomizedSearchCV(logreg, params, cv = kf, random_state = 42)

# X_imputed_scaled was imputed and scaled using all rows before the folds are
# made, so each validation fold has already influenced the preprocessing
# statistics; the score printed below is therefore somewhat optimistic.
logreg_cv.fit(X_imputed_scaled, y)
print(logreg_cv.best_params_)
print(logreg_cv.best_score_)

{'tol': 0.9478947368421053, 'solver': 'sag', 'class_weight': 'balanced', 'C': 0.16631578947368422}
0.8530000000000001


# With Pipeline

In [12]:
# Hold out 30% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Imputation -> standardization -> classifier, chained so that the imputer and
# scaler are fit only on the rows handed to pipeline.fit().
# NOTE(review): tol and solver are hard-coded here and differ from the
# best_params_ printed by the search above; presumably copied from an earlier
# search run -- confirm.
steps = [
  ("imputation", SimpleImputer(strategy = "mean")),
  ("standardization", StandardScaler()),
  (
    "logistic_regression",
    LogisticRegression(
      tol = 0.11421052631578947,
      solver = "lbfgs",
      class_weight = "balanced"
    )
  )
]
pipeline = Pipeline(steps)

# Fit on the training rows only, then predict and report accuracy on the
# held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(pipeline.score(X_test, y_test))

0.8433333333333334
