# In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import normalize

# Load the CSV file, decoding it as ISO-8859-1
cars_data = pd.read_csv("autos.csv", encoding="ISO-8859-1")
cars_data.head()

# Answer No. 1
# First look at the data: sample rows, summary statistics and column info
cars_data.head(10)
cars_data.describe(include="all")
cars_data.info()
# Percentage of missing values in each column
percent_null = cars_data.isna().sum().mul(100).div(len(cars_data))


# Answer No. 2
# Rename the camelCase headers to descriptive snake_case names
renamed_columns = {
    "dateCrawled": "date_crawled",
    "dateCreated": "ad_created",
    "lastSeen": "last_seen",
    "offerType": "offer_type",
    "vehicleType": "vehicle_type",
    "yearOfRegistration": "registration_year",
    "monthOfRegistration": "registration_month",
    "powerPS": "power_ps",
    "fuelType": "fuel_type",
    "notRepairedDamage": "unrepaired_damage",
    "nrOfPictures": "num_of_pictures",
    "postalCode": "postal_code",
}
cars_data.rename(columns=renamed_columns, inplace=True)
cars_data.head()


# Answer No. 3
# Parse the three timestamp columns into datetime values
date_columns = ["ad_created", "date_crawled", "last_seen"]
for date_column in date_columns:
    cars_data[date_column] = pd.to_datetime(cars_data[date_column])

cars_data[date_columns].info()


# Answer No. 4
# Strip the currency symbol, thousands separators and unit suffix as literal
# text. regex=False is required: interpreted as a regular expression, "$" is
# the end-of-string anchor, so the dollar sign would be left in place and the
# integer conversion below would fail.
cars_data["price"] = cars_data["price"].str.replace("$", "", regex=False)
cars_data["price"] = cars_data["price"].str.replace(",", "", regex=False)
cars_data["odometer"] = cars_data["odometer"].str.replace(",", "", regex=False)
cars_data["odometer"] = cars_data["odometer"].str.replace("km", "", regex=False)

# The cleaned values are digit strings, so they can be cast to int directly
cars_data["price"] = cars_data["price"].astype(int)
cars_data["odometer"] = cars_data["odometer"].astype(int)

cars_data[["price", "odometer"]].info()


# Answer No. 5
# Inspect the distribution of unique values in every column
for column_name, column_values in cars_data.items():
    print("\n" + column_name)
    print(column_values.value_counts())

# Drop the columns whose unique-value ratio is too large
columns_to_drop = ['name', 'seller', 'offer_type', 'num_of_pictures', 'postal_code']
cars_data = cars_data.drop(columns=columns_to_drop)
cars_data.head()


# Answer No. 6
cars_data['price'].value_counts()
# Remove, in place, the rows priced above 40000 and then those below 500
too_expensive = cars_data.index[cars_data['price'] > 40000]
cars_data.drop(too_expensive, inplace=True)
too_cheap = cars_data.index[cars_data['price'] < 500]
cars_data.drop(too_cheap, inplace=True)
cars_data['price'].sort_values()


# Answer No. 7
cars_data.info()
# Fill NaN in the categorical columns with the column's mode
for column in ['abtest', 'vehicle_type', 'gearbox', 'model',
               'fuel_type', 'brand', 'unrepaired_damage']:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: that chained in-place call may operate on a temporary
    # object and leave cars_data itself unchanged.
    cars_data[column] = cars_data[column].fillna(cars_data[column].mode()[0])

# Fill the remaining NaN with each column's median.
# fillna returns a new frame, so the result has to be assigned back, and
# numeric_only=True keeps median() from failing on the non-numeric columns.
cars_data = cars_data.fillna(cars_data.median(numeric_only=True))


# Answer No. 8 - Normalization using Z-Score
def z_score_standardization(series):
    """Return *series* centered on its mean and scaled by its standard deviation.

    series: object supporting subtraction/division and providing ``mean()``
        and ``std()`` (the script passes pandas Series columns).
    """
    centered = series - series.mean()
    spread = series.std()
    return centered / spread


# Standardize each numeric feature column independently
numeric_columns = ['registration_year', 'power_ps',
                   'odometer', 'registration_month']
cars_data[numeric_columns] = cars_data[numeric_columns].apply(z_score_standardization)


# Answer No. 9 - Encoding with the dummies method
# Define the class
class CategoricalFeatures:
    """Encode the categorical columns of a DataFrame.

    The encoding is chosen with ``encoding_type``:
    ``"label"`` (one sklearn ``LabelEncoder`` per column),
    ``"one_hot"`` (a single sklearn ``OneHotEncoder``) or
    ``"get_dum"`` (``pandas.get_dummies``).
    Call ``fit_transform()`` to obtain the encoded frame.
    """

    def __init__(self, df, categorical_features, encoding_type, handle_na=False):
        """
        df: pandas dataframe
        categorical_features: list of categorical column names e.g. nominal, ordinal data type
        encoding_type: type of encoding: "label", "one_hot" or "get_dum"
        handle_na: handle the missing values or not e.g. True/False; when True,
            missing values in the categorical columns are replaced by the
            string "-9999999" and the columns are cast to str
        """
        self.cat_feats = categorical_features
        self.enc_type = encoding_type
        self.handle_na = handle_na
        self.label_encoders = dict()
        self.one_hot_encoders = None

        if self.handle_na is True:
            # Work on a private copy so the caller's frame is left untouched.
            self.df = df.copy(deep=True)
            for c in self.cat_feats:
                # Fill first, cast second: astype(str) turns missing values
                # into ordinary strings, after which fillna finds nothing.
                self.df[c] = self.df[c].fillna("-9999999").astype(str)
        else:
            self.df = df
        self.output_df = self.df.copy(deep=True)

    def _label_encoding(self):
        """Replace each categorical column by integer labels; keep the fitted encoders."""
        for c in self.cat_feats:
            lbl = preprocessing.LabelEncoder()
            lbl.fit(self.df[c].values)
            # Replace the whole column (rather than writing through .loc[:, c])
            # so it takes the dtype of the encoded labels.
            self.output_df[c] = lbl.transform(self.df[c].values)
            self.label_encoders[c] = lbl
        return self.output_df

    def _one_hot_encoding(self):
        """Replace the categorical columns by one-hot columns named 0..k-1."""
        encoder = preprocessing.OneHotEncoder()
        encoder.fit(self.df[self.cat_feats].values)
        # Keep the fitted encoder available, as is done for the label encoders.
        self.one_hot_encoders = encoder
        encoded = encoder.transform(self.df[self.cat_feats].values).toarray()
        dum_ct = pd.DataFrame(encoded, index=self.df.index)
        self.output_df = self.df.drop(columns=self.cat_feats).join(dum_ct)
        return self.output_df

    def _get_dummies(self):
        """Encode the categorical columns with pandas.get_dummies (no NaN indicator column)."""
        self.output_df = pd.get_dummies(self.df, columns=self.cat_feats, dummy_na=False)
        return self.output_df

    def fit_transform(self):
        """Run the encoding selected by ``encoding_type`` and return the encoded frame.

        Raises ValueError if the encoding type is not one of the supported names.
        """
        if self.enc_type == "label":
            return self._label_encoding()
        elif self.enc_type == "one_hot":
            return self._one_hot_encoding()
        elif self.enc_type == "get_dum":
            return self._get_dummies()
        else:
            # ValueError is a subclass of Exception, so existing handlers still catch it.
            raise ValueError("Encoding type not supported!")

# Encoding process
categorical_columns = ['abtest', 'vehicle_type',
                       'gearbox', 'model',
                       'fuel_type', 'brand',
                       'unrepaired_damage']
cars_encoder = CategoricalFeatures(cars_data, categorical_columns,
                                   'get_dum', handle_na=False)

encoding_cars_data = cars_encoder.fit_transform()

# The encoded frame already contains every non-categorical column of
# cars_data, so concatenating both frames in full would duplicate those
# columns. Append only the newly created dummy columns to the original data.
dummy_columns = [name for name in encoding_cars_data.columns
                 if name not in cars_data.columns]
final_cars_data = pd.concat([cars_data, encoding_cars_data[dummy_columns]], axis=1)
final_cars_data