# Data Exploration

### Import

In [None]:
#import
import numpy as np
import pandas as pd

# Show every column when a frame is printed instead of eliding the middle ones.
# The fully qualified option key is used because the short form 'max_columns'
# can match more than one option in recent pandas releases and is then rejected.
pd.set_option('display.max_columns', None)

# Read the CSV. header=None (the None object, not the string 'None') tells
# pandas the file has no header row, so the columns get integer labels.
# NOTE: the path is an empty placeholder and must be filled in before running.
df = pd.read_csv('', header=None)

### Set Header

In [None]:
# Column labels to apply, listed in the same order as the columns of df.
# (Placeholder names; the list length has to equal the number of columns.)
headers = [
    "header_1",
    "header_2",
    "header_3",
]
df.columns = headers

# Print the labels now attached to the frame.
print(df.columns)

### See Dataframe

In [None]:
# Peek at the top of the frame (first 5 rows).
df.head(n=5)

# ...and at the bottom (last 5 rows).
df.tail(n=5)

### Export Dataset

In [None]:
# Write the frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="automobile.csv", index=False)

### Data Types

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Statistical summary of the columns describe() covers by default.
df.describe()
# More advanced summary: include="all" asks describe() to cover every column.
df.describe(include="all")

In [None]:
# Concise summary of the frame. info is a method and has to be called;
# the bare attribute only evaluates to the bound-method object.
df.info()

# Data Wrangling

### Handle Missing Values
How to deal with missing data?
##### drop data
- drop the whole row
- drop the whole column

##### replace data
- replace it with the mean of the column
- replace it with the most frequent value of the column
- replace it based on other functions

In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing.
missing_data = df.isnull()
missing_data.head()

# Per column, count how many entries are missing (True) vs. present (False).
for column in missing_data.columns:
    print(column)
    print(missing_data[column].value_counts())
    print("")

# The same information in compact form: number of missing values per column.
df.isna().sum()

# Drop every row that has no "price".
df.dropna(subset=["price"], axis=0, inplace=True)

# Mean of "normalized-losses", computed on a float cast of the column.
avg_norm_loss = df["normalized-losses"].astype(float).mean(axis=0)

# Fill the gaps with that mean. The result is assigned back to the column:
# an in-place call on df[...] operates on an intermediate object, which pandas
# warns about and which does not update df under Copy-on-Write.
df["normalized-losses"] = df["normalized-losses"].fillna(avg_norm_loss)

# Distribution of the values present in "num-of-doors".
df["num-of-doors"].value_counts()

# Fill missing door counts with the most frequent value, taken from the data
# instead of being hard-coded.
most_common_doors = df["num-of-doors"].value_counts().idxmax()
df["num-of-doors"] = df["num-of-doors"].fillna(most_common_doors)

# Rows were dropped above, so renumber the index and discard the old one.
df.reset_index(drop=True, inplace=True)

In [None]:
# Target dtype for each column that needs an explicit cast.
column_types = {
    "bore": "float",
    "stroke": "float",
    "normalized-losses": "int",
    "price": "float",
    "peak-rpm": "float",
}
for column_name, target_type in column_types.items():
    df[column_name] = df[column_name].astype(target_type)

# Final check of the resulting dtypes.
df.dtypes

In [None]:
# Convert highway fuel economy from miles per gallon to litres per 100 km
# (235 / mpg), then rename the column so its label matches the new unit.
df["highway-mpg"] = 235 / df["highway-mpg"]

# The key must be the bare column label. With literal quote characters inside
# the string it matches no column and rename() silently changes nothing.
df.rename(columns={"highway-mpg": "highway-L/100km"}, inplace=True)
df.head()

### Data Normalization

In [None]:
# The three scalings below run one after another on the same column, so each
# step works on the output of the previous one.

# Simple feature scaling: divide by the column maximum.
length = df["length"]
df["length"] = length / length.max()

# Min-max feature scaling: shift by the minimum, divide by the value range.
length = df["length"]
length_min = length.min()
length_max = length.max()
df["length"] = (length - length_min) / (length_max - length_min)

# Z-score: centre on the mean, divide by the standard deviation.
length = df["length"]
df["length"] = (length - length.mean()) / length.std()

### Binning

In [None]:
#Apply Histogram Code
%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot
plt.pyplot.hist(df["price"])

# set x/y labels and plot title
plt.pyplot.xlabel("price")
plt.pyplot.ylabel("count")
plt.pyplot.title("price bins")

In [None]:
# Three equal-width price bands need four edges spanning min..max of "price".
lowest_price = min(df["price"])
highest_price = max(df["price"])
bins = np.linspace(lowest_price, highest_price, num=4)
group_names = ["Low", "Medium", "High"]

# Label every price with its band; include_lowest=True makes the first band
# include its left edge, i.e. the minimum price.
df["price-binned"] = pd.cut(
    df["price"],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
df[["price", "price-binned"]].head()
# TODO: apply the histogram again, this time on the binned column

### One Hot Encoding

In [None]:
# Build one indicator (dummy) column per distinct value of "fuel-type".
# The column name matches the one dropped at the end of this cell.
dummy_variable_1 = pd.get_dummies(df["fuel-type"])

# Map prefixed indicator names to the bare fuel names. Each source label is
# listed once; a repeated dict key would silently discard the earlier mapping.
# NOTE: get_dummies on a single Series names the columns after the category
# values, so this rename only has an effect if the labels carry the prefix.
dummy_variable_1.rename(
    columns={"fuel-type-gas": "gas", "fuel-type-diesel": "diesel"},
    inplace=True,
)
dummy_variable_1.head()

# Append the indicator columns to df side by side (axis=1).
df = pd.concat([df, dummy_variable_1], axis=1)

# Drop the original "fuel-type" column now that it is encoded.
df.drop("fuel-type", axis=1, inplace=True)

###