# Preprocessing


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Read the two CSV files used by the cells below.
SALES_CSV = "datasets/advertising_and_sales_clean.csv"
MUSIC_CSV = "datasets/music_clean.csv"

sales_df = pd.read_csv(SALES_CSV)
music_df = pd.read_csv(MUSIC_CSV)

# Show the sales frame
print(sales_df)

           tv     radio  social_media influencer      sales
0     16000.0   6566.23       2907.98       Mega   54732.76
1     13000.0   9237.76       2409.57       Mega   46677.90
2     41000.0  15886.45       2913.41       Mega  150177.83
3     83000.0  30020.03       6922.30       Mega  298246.34
4     15000.0   8437.41       1406.00      Micro   56594.18
...       ...       ...           ...        ...        ...
4541  26000.0   4472.36        717.09      Micro   94685.87
4542  71000.0  20610.69       6545.57       Nano  249101.92
4543  44000.0  19800.07       5096.19      Micro  163631.46
4544  71000.0  17534.64       1940.87      Macro  253610.41
4545  42000.0  15966.69       5046.55      Micro  148202.41

[4546 rows x 5 columns]


# Create dummy variables

#### When only one column holds categorical data, pass the whole DataFrame


In [None]:
# Build dummy variables from the whole frame; drop_first=True leaves out the
# first level of each encoded column.
sales_dummy = pd.get_dummies(data=sales_df, drop_first=True)

# Show the first rows of the encoded frame
dummy_preview = sales_dummy.head()
print(dummy_preview)

#### If more than one column holds categorical data, encode one column, concatenate its dummies, and drop the original column


In [None]:
# Encode only the "influencer" column, leaving out its first level
influencer_dummies = pd.get_dummies(sales_df["influencer"], drop_first=True)
print(influencer_dummies.head())
print("------------------------------------------------------")

# Append the dummy columns to the original frame, then remove the
# original "influencer" column
sales_dummy = (
    pd.concat([sales_df, influencer_dummies], axis=1)
    .drop(columns="influencer")
)
print(sales_dummy.head())

## Binarizing a column

In [None]:
# Convert category to a binary feature: 1 where influencer is 'Mega', else 0.
#
# The result is stored in a new frame rather than written back into
# sales_df["influencer"]. Overwriting the column made this cell non-idempotent
# (a second run compares the 0/1 values with 'Mega' and yields all zeros) and
# changed what every other cell reading sales_df["influencer"] would see.
sales_binary = sales_df.assign(
    influencer=np.where(sales_df["influencer"] == "Mega", 1, 0)
)
print(sales_binary)

# Handling missing data

In [None]:
# Number of missing values in each column, largest count first
missing_per_column = music_df.isna().sum()
print(missing_per_column.sort_values(ascending=False))

## Exclude categorical data

In [3]:
# Keep every column whose dtype is not 'object'
X = sales_df.select_dtypes(exclude=["object"])

# Show the first rows of the remaining columns
first_rows = X.head()
print(first_rows)

        tv     radio  social_media      sales
0  16000.0   6566.23       2907.98   54732.76
1  13000.0   9237.76       2409.57   46677.90
2  41000.0  15886.45       2913.41  150177.83
3  83000.0  30020.03       6922.30  298246.34
4  15000.0   8437.41       1406.00   56594.18


## Drop missing values

In [None]:
# drop all missing
music_df_clean = music_df.dropna()

#or subset
#music_df_clean = music_df.dropna(subset=['valence'])

print(music_df_clean.isna().sum().sort_values(ascending=False))

## Impute missing values (Fill missing data)
#### The features must be split into numerical and categorical data, because each needs a different fill strategy
- Fill with **Mean** for numerical data
- Fill with **Mode** for categorical data

In [None]:
from sklearn.impute import SimpleImputer

# Separate the categorical feature, the remaining feature columns (treated as
# numerical here) and the target.
X_cat = music_df['genre'].values.reshape(-1, 1)  # reshape to a single 2-D column
X_num = music_df.drop(['popularity', 'genre'], axis=1).values
y = music_df['popularity'].values

# Split all three arrays in ONE call so categorical rows, numerical rows and
# targets stay aligned by construction. Two separate calls only line up as long
# as both happen to use the same random_state, and assign y_train / y_test twice.
(
    X_train_cat, X_test_cat,
    X_train_num, X_test_num,
    y_train, y_test,
) = train_test_split(X_cat, X_num, y, test_size=0.2, random_state=12)

# Categorical data: fill with the most frequent value (mode).
# The imputer is fitted on the training split only and then applied to the test
# split, so no information from the test rows is used for fitting.
imp_cat = SimpleImputer(strategy='most_frequent')
X_train_cat = imp_cat.fit_transform(X_train_cat)
X_test_cat = imp_cat.transform(X_test_cat)

# Numerical data: fill with the column mean, again fitted on the training split.
imp_num = SimpleImputer(strategy='mean')
X_train_num = imp_num.fit_transform(X_train_num)
X_test_num = imp_num.transform(X_test_num)

# Recombine column-wise: numerical columns first, the categorical column last.
X_train = np.concatenate([X_train_num, X_train_cat], axis=1)
X_test = np.concatenate([X_test_num, X_test_cat], axis=1)

## Imputing with Pipeline

In [None]:
# Imported here as well so this cell does not depend on an earlier cell having run.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Features: every column except 'popularity'; target: 'popularity'.
# NOTE(review): LogisticRegression is a classifier — confirm 'popularity' holds
# class labels rather than a continuous score.
# NOTE(review): SimpleImputer() uses its default strategy (mean), which needs
# numeric input — confirm every feature column is numeric.
X = music_df.drop('popularity', axis=1).values
y = music_df['popularity'].values

# Imputation runs first, then the classifier; inside a Pipeline the imputer is
# fitted only on the data passed to fit().
steps = [('imputation', SimpleImputer()), ('logistic_regression', LogisticRegression())]
pipeline = Pipeline(steps)

# Fixed seed so the split (and therefore the score) is reproducible on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12
)

pipeline.fit(X_train, y_train)

# Evaluate on the held-out test split. Scoring on the training data the model
# was fitted on overstates performance and leaves the test split unused.
pipeline.score(X_test, y_test)