# Preprocessing


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Locations of the two cleaned CSV files this notebook works with.
SALES_CSV = "datasets/advertising_and_sales_clean.csv"
MUSIC_CSV = "datasets/music_clean.csv"

# Load each file into its own DataFrame.
sales_df = pd.read_csv(SALES_CSV)
music_df = pd.read_csv(MUSIC_CSV)

# Display the sales data as loaded.
print(sales_df)

# Create dummy variables

#### Only applicable if one column has categorical data


In [None]:
# Encode the categorical columns of the whole sales frame in one call.
# drop_first=True leaves out the first level of each encoded column.
sales_dummy = pd.get_dummies(data=sales_df, drop_first=True)

# Preview the first rows of the encoded frame.
encoded_preview = sales_dummy.head()
print(encoded_preview)

#### If there is more than one column of categorical data, create the dummies per column, concatenate them to the DataFrame, and drop the original column


In [None]:
# Build dummy columns for the "influencer" column only
# (drop_first=True leaves out the first category level).
influencer_dummies = pd.get_dummies(sales_df["influencer"], drop_first=True)
print(influencer_dummies.head())
print("------------------------------------------------------")

# Attach the dummy columns next to the original data, then remove the
# original categorical column that they replace.
sales_dummy = pd.concat([sales_df, influencer_dummies], axis=1).drop(
    columns="influencer"
)
print(sales_dummy.head())

## Binarizing column

In [None]:
# Flag the rows whose influencer category is 'Mega'.
is_mega = sales_df["influencer"].eq('Mega')

# Overwrite the column in place: 1 for 'Mega', 0 for every other value.
sales_df["influencer"] = np.where(is_mega, 1, 0)
print(sales_df)

# Handling missing data

In [None]:
# Count the missing values in each column of the music data ...
missing_per_column = music_df.isna().sum()

# ... and list the columns with the most missing values first.
print(missing_per_column.sort_values(ascending=False))

## Exclude categorical data

In [None]:
# Keep only the columns whose dtype is not 'object'.
X = sales_df.select_dtypes(exclude='object')

# Preview the first rows of the remaining columns.
feature_preview = X.head()
print(feature_preview)

## Get names of columns with missing data

In [None]:
# Per-column flag: True where the column holds at least one missing value.
has_missing = music_df.isnull().any()

# Names of the affected columns, kept in their original column order.
cols_with_missing = music_df.columns[has_missing.to_numpy()].tolist()
print(cols_with_missing)
print(music_df.shape)

# Drop every column that contains missing values from music_df and
# print the new shape for comparison with the one above.
reduced = music_df.drop(columns=cols_with_missing)
print(reduced.shape)

## Drop missing values

In [None]:
separator = "------------------------------------------------------"

# Missing-value count for every column.
missing_values = music_df.isna().sum()
print(missing_values)

print(separator)
# Only the columns that actually contain missing values.
has_gaps = missing_values.gt(0)
print(missing_values.loc[has_gaps])

print(separator)

# Drop every row that has a missing value in any column.
music_df_clean = music_df.dropna(axis=0)
# Alternative: only consider specific columns when dropping rows.
#music_df_clean = music_df.dropna(subset=['valence'])

# Confirm the per-column missing counts after dropping.
remaining_missing = music_df_clean.isna().sum()
print(remaining_missing)

## Impute missing values (Fill missing data)
#### The features must be split into numerical and categorical data before imputing
- Fill with **Mean** for numerical data
- Fill with **Mode** for categorical data

In [None]:
from sklearn.impute import SimpleImputer

# Separate the features by type, because each type gets its own imputation
# strategy: 'genre' is treated as categorical, every other non-target
# column as numerical. 'popularity' is the target.
X_cat = music_df['genre'].values.reshape(-1, 1)  # 2-D, as SimpleImputer expects
X_num = music_df.drop(['popularity', 'genre'], axis=1).values
y = music_df['popularity'].values

# Split all three arrays in a single call. train_test_split applies the same
# row selection to every array it receives, so the categorical features,
# numerical features and target are guaranteed to stay aligned -- without
# relying on two separate calls sharing the same random_state.
(X_train_cat, X_test_cat,
 X_train_num, X_test_num,
 y_train, y_test) = train_test_split(
    X_cat, X_num, y, test_size=0.2, random_state=12
)

# Categorical data: fill missing values with the most frequent value.
# The imputer is fitted on the training split only and then applied to the
# test split, so no information from the test data leaks into the fit.
imp_cat = SimpleImputer(strategy='most_frequent')
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)

# Numerical data: fill missing values with the column mean (same
# fit-on-train / transform-on-test pattern).
imp_num = SimpleImputer(strategy='mean')
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)

# Re-assemble the full feature matrices: numerical columns first, then the
# categorical column.
X_train = np.concatenate([X_train_num, X_train_cat], axis=1)
X_test = np.concatenate([X_test_num, X_test_cat], axis=1)

## Imputing with Pipeline

In [None]:
from sklearn.pipeline import Pipeline

# Features are every column except the target 'popularity'.
X = music_df.drop('popularity', axis=1).values
y = music_df['popularity'].values

# Pipeline steps run in order: impute missing values first (SimpleImputer
# with its default strategy), then fit the logistic regression model.
steps = [('imputation', SimpleImputer()), ('logistic_regression', LogisticRegression())]

pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training split only; the imputer learns its fill values from
# the training data and re-uses them when transforming the test data.
pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split. Scoring on the training data (as the
# cell did before) reports how well the model fits data it has already seen
# and leaves X_test / y_test unused.
pipeline.score(X_test, y_test)