# Data Exploration

### Import

In [None]:
#import
import numpy as np
import pandas as pd

#show every column when a dataframe is printed
#(full option key: the bare 'max_columns' pattern matches more than one option in recent pandas)
pd.set_option('display.max_columns', None)

#read CSV; header=None (the None object, not the string 'None') means the file has no header row
#NOTE: the path is an empty placeholder -- fill in the dataset location before running
df = pd.read_csv('', header=None)

### Set Header

In [None]:
#column labels to apply (the list length must match the number of columns in df)
headers = [
    "header_1",
    "header_2",
    "header_3",
]
df.columns = headers

#print the column index to confirm the new labels
print(df.columns)

### See Dataframe

In [None]:
#peek at the top of the dataframe (first 5 rows)
df.head(n=5)

#peek at the bottom of the dataframe (last 5 rows)
df.tail(n=5)

### Export Dataset

In [None]:
#write the dataframe to a CSV file, leaving out the row index
df.to_csv(path_or_buf="automobile.csv", index=False)

### Data Types

In [None]:
#check datatypes: one entry per column with the dtype pandas assigned to it
#(an object dtype usually means text or mixed values)
df.dtypes

In [None]:
#get statistical summary (by default only numeric columns are summarised)
#NOTE: a notebook cell only displays its last bare expression, so this first result is not shown
df.describe()
#more advanced summary: include="all" summarises every column, not just the numeric ones
df.describe(include="all")

In [None]:
#more concise summary: column names, non-null counts, dtypes and memory usage
#(info is a method -- without the parentheses the cell only shows the bound method, not the summary)
df.info()

# Data Wrangling

### Handle Missing Values
How to deal with missing data?
##### drop data
- drop the whole row
- drop the whole column

##### replace data
- replace it by mean
- replace it by frequency
- replace it based on other functions

In [None]:
# missing data check: boolean frame, True wherever a value is missing
missing_data = df.isnull()
missing_data.head()

#missing data counter: per column, how many values are missing (True) vs present (False)
for column in missing_data.columns:
    print(column)
    print(missing_data[column].value_counts())
    print("")

#shortcut: number of missing values per column
df.isna().sum()

# dropna: remove every row that has no price
df.dropna(subset=["price"], axis=0, inplace=True)

#change column type to float, then take the mean (NaN entries are skipped by mean)
avg_norm_loss = df["normalized-losses"].astype(float).mean(axis=0)

#replace NaN with the column mean
#(assign the result back: replace(..., inplace=True) on a column selection is chained
# assignment and is not guaranteed to update df itself)
df["normalized-losses"] = df["normalized-losses"].replace(np.nan, avg_norm_loss)

#to see which values are present in a particular column
df["num-of-doors"].value_counts()
#get the most frequent value and use it to fill the gaps (instead of hard-coding it)
most_common_doors = df["num-of-doors"].value_counts().idxmax()
df["num-of-doors"] = df["num-of-doors"].replace(np.nan, most_common_doors)

#reset index after dropping rows so the labels run 0..n-1 again
df.reset_index(drop=True, inplace=True)

In [None]:
#cast column groups to their target dtypes, in this order
conversions = [
    (["bore", "stroke"], "float"),
    (["normalized-losses"], "int"),
    (["price"], "float"),
    (["peak-rpm"], "float"),
]
for column_group, target_type in conversions:
    df[column_group] = df[column_group].astype(target_type)

#final look at the resulting dtypes
df.dtypes

In [None]:
#convert the values from mpg to L/100km (235 / mpg), then rename the column to match the new unit
#NOTE: run this cell only once -- after the rename the "highway-mpg" column no longer exists
df["highway-mpg"] = 235/df["highway-mpg"]
#the key must be the plain column name; extra quote characters inside the string never match
df.rename(columns={'highway-mpg': 'highway-L/100km'}, inplace=True)
df.head()

### Data Normalization

In [None]:
#NOTE: each step below overwrites df["length"], so every step works on the previous step's output

#Simple Feature Scaling: value / max
df["length"] = df["length"].div(df["length"].max())
#Min Max Feature Scaling: (value - min) / (max - min)
df["length"] = df["length"].sub(df["length"].min()).div(df["length"].max() - df["length"].min())
#Z-Score: (value - mean) / standard deviation
df["length"] = df["length"].sub(df["length"].mean()).div(df["length"].std())

### Binning

In [None]:
#Apply Histogram Code
%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot
plt.pyplot.hist(df["price"])

# set x/y labels and plot title
plt.pyplot.xlabel("price")
plt.pyplot.ylabel("count")
plt.pyplot.title("price bins")

In [None]:
# grouping values: 4 equally spaced edges between the lowest and highest price give 3 bins
#(Series.min()/max() skip NaN and run vectorised, unlike the builtin min()/max())
bins = np.linspace(df["price"].min(), df["price"].max(), 4)
group_names = ["Low", "Medium", "High"]
#include_lowest=True makes the first bin closed on the left so the minimum price gets a label
df["price-binned"] = pd.cut(df["price"], bins, labels=group_names, include_lowest=True)
df[["price", "price-binned"]].head()
#TODO: apply the histogram again, this time on the binned column

### One Hot Encoding

In [None]:
#convert to 0,1 values: one indicator column per distinct value of "fuel-type"
#(same column that is dropped at the end of this cell)
dummy_variable_1 = pd.get_dummies(df['fuel-type'])
#get_dummies on a single column names the new columns after the values themselves,
#so prefix them to keep their origin clear once they are merged into df
#assumes the column holds the values 'gas' and 'diesel' -- TODO confirm against the data
dummy_variable_1.rename(columns={'gas':'fuel-type-gas', 'diesel':'fuel-type-diesel'}, inplace=True)
dummy_variable_1.head()

# merge data frame "df" and "dummy_variable_1" column-wise
df = pd.concat([df, dummy_variable_1], axis=1)

# drop original column "fuel-type" from "df" now that the indicator columns replace it
df.drop("fuel-type", axis = 1, inplace=True)

# Exploratory Data Analysis

In [None]:
# quick statistical summary of the dataframe
df.describe()

# number of rows for each distinct drive-wheels value
drive_wheels_counts = df.loc[:, "drive-wheels"].value_counts()

In [None]:
#DESCRIPTIVE ANALYSIS
#1. boxplot to see distribution and outlier
#2. scatter plot to see relationship between 2 variables (predictor, target)

In [None]:
#GROUPING DATA
#mean price for every (drive-wheels, body-style) combination; keys stay as regular columns
df_test = df[['drive-wheels','body-style','price']]
df_grp = (
    df_test
    .groupby(by=['drive-wheels', 'body-style'], as_index=False)
    .mean()
)
df_grp

In [None]:
#PIVOT
#reshape the grouped result: one row per drive-wheels value, one column per body-style value
df_pivot = df_grp.pivot(
    columns='body-style',
    index='drive-wheels',
)

<img src="./pandas_pivot1.png">

In [None]:
# install
%%capture
! pip install seaborn

#import
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [None]:
#Heatmap = see relationship in multiple variables
#draw the pivot table as a grid of colored cells, then attach the color scale for that grid
heatmap_mesh = plt.pcolor(df_pivot, cmap='RdBu')
plt.colorbar(heatmap_mesh)
plt.show()

<img src="./heatmap1.png">

In [None]:
# advanced: heatmap with the pivot table's own labels on both axes
# (grouped_pivot is created by the pivot cell further down in this notebook -- run that first)
fig, ax = plt.subplots()
im = ax.pcolor(grouped_pivot, cmap='RdBu')

#labels: second level of the column index goes on x, the row index goes on y
row_labels = grouped_pivot.columns.levels[1]
col_labels = grouped_pivot.index

#put each tick in the middle of its cell (cell edges sit on whole numbers)
n_rows, n_cols = grouped_pivot.shape
ax.set_xticks(np.arange(n_cols) + 0.5, minor=False)
ax.set_yticks(np.arange(n_rows) + 0.5, minor=False)

#attach the labels to those ticks
ax.set_xticklabels(row_labels, minor=False)
ax.set_yticklabels(col_labels, minor=False)

#turn the x labels vertical so long names do not overlap
plt.xticks(rotation=90)

fig.colorbar(im)
plt.show()

<img src="./heatmap2.png">

In [None]:
#ANOVA (Analysis of Variance) = test whether the mean of a numeric variable differs between the groups of a categorical variable
#ex. Average price of different vehicle makes.
#returns:
#1. F-test score = ratio of the variation between the group means to the variation within the groups. Bigger score = group means differ more
#2. p-value = how likely such a score is if all group means were equal. p < 0.05 = reject the null hypothesis (the difference is statistically significant)

#imported here so the cell runs even before the later cell that also imports stats
from scipy import stats

df_anova = df[["make","price"]]
#group by the plain column name (not a one-element list) so get_group accepts a scalar key
grouped_anova = df_anova.groupby("make")

#anova components:
#1. get_group => the rows that belong to one group
grouped_anova.get_group('subaru')['price']
#2. f_oneway => get f-test score and p-value
#NOTE(review): confirm that "mercedes" matches the make label used in the data
anova_results_1 = stats.f_oneway(
    grouped_anova.get_group("honda")["price"],
    grouped_anova.get_group("subaru")["price"],
    grouped_anova.get_group("mercedes")["price"],
)

In [None]:
#Correlation = measure to what extent different variables are interdependent
#Correlation doesn't imply causation
#ex:
#1. Lung cancer -> smoking
#2. rain -> umbrella
#returns: positive, negative, weak, strong, no correlation

#THIS IS FOR NUMERICAL VARIABLES
#scatter plot with a fitted regression line
sns.regplot(x="engine-size", y="price", data=df)
#start the y axis at 0, leave the upper limit automatic
plt.ylim(0,)

# seek correlation value after visualisation (same feature/target pair as the plot above):
df[['engine-size','price']].corr()

In [None]:
#THIS IS FOR CATEGORICAL VARIABLES (object/int data types allowed)
#use boxplot: boxes that barely overlap suggest the category separates the target well
sns.boxplot(x="body-style", y="price", data=df)

#describe for categorical (object-typed) columns
df.describe(include=['object'])

#check how many units of each variable we have. Note: we are not using double bracket, value_counts works for pandas series (not pandas df)
#name the count column directly in to_frame: the default name of the value_counts result
#differs between pandas versions, so renaming it afterwards is not reliable
engine_loc_counts = df['engine-location'].value_counts().to_frame(name='value_counts')
engine_loc_counts.index.name = 'engine-location'
engine_loc_counts.head(10)

<img src="./value_count.png">

In [None]:
#Another correlation implementation: Pearson Correlation
#Aim: measure the strength of the linear correlation between two features.
#consists of:
#1. correlation coefficient: linearity test. (+1) strong positive relationship; (-1) strong negative relationship; (0) no relationship
#2. p-value: statistical significance test. The smaller the better: p < 0.05 = significant, p > 0.1 = no evidence of correlation.

from scipy import stats

#assumes both columns are numeric and free of missing values -- TODO confirm "horsepower" has been cleaned
pearson_coef, p_value = stats.pearsonr(df['horsepower'], df['price'])
print("The Pearson Correlation Coefficient is", pearson_coef, " with a P-value of P =", p_value)

### Basics of Grouping

In [None]:
#see different categories
df['drive-wheels'].unique()

#assign to variables
df_group_one = df[['drive-wheels','body-style','price']]

#grouping results: average per drive-wheels value
#numeric_only=True leaves the text column "body-style" out of the average
#(recent pandas raises an error when asked to average a non-numeric column)
df_group_one = df_group_one.groupby(['drive-wheels'], as_index=False).mean(numeric_only=True)
df_group_one

<img src="./groupby.png">

In [None]:
# groupby also accepts several key columns at once:
# mean price per (drive-wheels, body-style) pair, with the keys kept as regular columns
df_gptest = df[['drive-wheels','body-style','price']]
grouped_test1 = (
    df_gptest
    .groupby(by=['drive-wheels','body-style'], as_index=False)
    .mean()
)
grouped_test1

<img src="./groupby2.png">

In [None]:
# ALTERNATIVE layout: a pivot table is always a safe choice
# rows = drive-wheels values, columns = body-style values
grouped_pivot = grouped_test1.pivot(
    columns='body-style',
    index='drive-wheels',
)
grouped_pivot

<img src="./pivot.png">

In [None]:
# combinations that never occur show up as NaN in the pivot; put 0 in those cells
grouped_pivot = grouped_pivot.fillna(value=0)
grouped_pivot

# Model Development
$y = b_0 + b_1 x$
- $y$ = target / dependent variable
- $x$ = predictor / independent variable
- $b_0$ = intercept
- $b_1$ = slope

noise = small random value added 