# Day 1

# Intro to Python libraries

https://www.projectpro.io/article/top-5-libraries-for-data-science-in-python/196

In [1]:
import pandas as pd  # For loading data into a tabular format
import numpy as np   # For manipulating data
import matplotlib.pyplot as plt  # For graph plotting
import seaborn as sns  # For graph plotting
from sklearn.preprocessing import LabelEncoder  # For label encoding in EDA
from sklearn.utils import resample   # For data sampling in EDA
from sklearn.utils import shuffle   # For shuffling the data

In [3]:
# Read the California housing CSV into a DataFrame
housing_csv_path = '../input/california-housing-prices/housing.csv'
housing = pd.read_csv(housing_csv_path)

In [4]:
# Preview the first rows of the DataFrame (the output below shows 5 rows)
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


https://developers.google.com/machine-learning/crash-course/california-housing-data-description

# Data Visualization

In [None]:
# Plot every district by its coordinates using plain matplotlib (pyplot)
scatter_plt = plt.scatter(housing['longitude'], housing['latitude'])
plt.show()

In [None]:
# The same scatterplot drawn by seaborn.
# seaborn usually draws the plot itself, while plt (matplotlib) is used for
# figure-level visuals and text.
scatter_sns_1 = sns.scatterplot(data=housing, x='longitude', y='latitude')
plt.show()

In [None]:
# Create scatterplot with seaborn + add HUE parameter
# `hue` names a grouping column: seaborn assigns one color to each distinct
# value of that column and adds a legend. That is why the plain strings in
# 'ocean_proximity' can be used directly -- no numeric or color encoding is needed.
scatter_sns_2 = sns.scatterplot(x='longitude', y='latitude', data=housing, hue='ocean_proximity')

plt.show()

In [None]:
# Draw the hue-colored scatterplot and put a title above it
scatter_sns_2 = sns.scatterplot(data=housing, x='longitude', y='latitude', hue='ocean_proximity')
scatter_sns_2.set_title('California Housing Geography')
plt.show()

In [None]:
# Boxplot of the housing_median_age distribution (seaborn)
boxplot_sns = sns.boxplot(data=housing, x='housing_median_age')
plt.show()

In [None]:
# Boxplot of the population distribution (seaborn)
boxplot_sns = sns.boxplot(data=housing, x='population')
plt.show()

In [None]:
# Count how many rows fall under each ocean_proximity category
ocean_proximity_col = housing['ocean_proximity']
data_barplot = ocean_proximity_col.value_counts()

# Show the counts
data_barplot

In [None]:
# Bar chart through the pandas plotting accessor, bars ordered by ascending count
counts_ascending = data_barplot.sort_values()
counts_ascending.plot.bar(color=['red', 'green', 'black', 'blue', 'orange'])
plt.show()

In [None]:
# The same counts as a seaborn barplot (categories on x, counts on y)
barplot = sns.barplot(x=data_barplot.index, y=data_barplot.values)

In [None]:
# Open an 8x6 figure first so the barplot is drawn at that size
plt.figure(figsize=(8, 6))
barplot = sns.barplot(x=data_barplot.index, y=data_barplot.values)
plt.show()

In [None]:
# Barplot with each count written above its bar
plt.figure(figsize=(10, 6))
barplot = sns.barplot(x=data_barplot.index, y=data_barplot.values)

# plt.text(x, y, text, alignment): x is the bar position, y is 100 above the bar top
for position, count in enumerate(data_barplot.values):
    plt.text(position, count + 100, count, horizontalalignment='center')

plt.show()

In [None]:
# Open the figure
plt.figure(figsize=(8, 6))

# Draw the bars
barplot = sns.barplot(x=data_barplot.index, y=data_barplot.values)

# Write each count 100 units above its bar
for position, count in enumerate(data_barplot.values):
    plt.text(position, count + 100, count, horizontalalignment='right')

# Title and axis labels, set on the Axes object that seaborn returned
barplot.set_title('Ocean proximity counts')
barplot.set_ylabel('Counts')
barplot.set_xlabel('Ocean proximity')

# seaborn draws onto a matplotlib Axes, so labels can be set either through
# plt.title()/plt.xlabel()/plt.ylabel() or through the Axes itself,
# e.g. barplot.set(xlabel='...', ylabel='...').

plt.show()

# Preprocessing Data for EDA and EDA

**Goals:**
1. Classification => ocean proximity
2. Regression => median housing value

**Flow:**
1. ?Downsampling
2. Find null values and fill them (numeric columns => mean / median; other columns => mode / a new category)
3. Find duplicates and remove them
1. ?Downsampling 
4. Check for correlation (for Regression Task)
5. Transform the latest DataFrame into 'features' that a predictive (ML) model can use easily
6. Encode the DataFrame into more readable form (opt.)

**Downsampling Steps on Single Column**:
1. Exclude the outliers (most different data, but with very small count)
2. Count the rows of each category in the column and store the result as a dictionary: (key, value) = (category_name, count)
3. Get the column's type with minimal count as anchor (key and value)
4. Perform downsampling for reducing and flattening data size into the same as minimal type's count
   - Use resample() method as coded below
5. Consider other columns to be downsampled

In [None]:
# Drop the ISLAND rows from ocean_proximity.
# The comparison yields one True/False per row; indexing the DataFrame with
# that boolean mask keeps only the rows marked True.
not_island = housing['ocean_proximity'] != 'ISLAND'
housing_filtered = housing[not_island]

# Map each remaining ocean_proximity category to its row count
category_counts = housing_filtered['ocean_proximity'].value_counts()
ocean_prox_dict = dict(category_counts)

ocean_prox_dict

In [None]:
## Look up which key of ocean_prox_dict holds the minimum value
def get_min_key(ocean_prox_dict, min_value):
    """Return the key of ``ocean_prox_dict`` whose value equals ``min_value``.

    If several keys share that value, the one that comes last in the
    dictionary's iteration order is returned. If no value matches, an
    empty string is returned.
    """
    matching_keys = [key for key, value in ocean_prox_dict.items() if value == min_value]
    return matching_keys[-1] if matching_keys else ''

# Smallest category size, and the category that has it.
# These two values are the anchor for the downsampling step.
min_value = min(ocean_prox_dict.values())
min_key = get_min_key(ocean_prox_dict, min_value)

# Display the (category, count) pair
min_key, min_value

# Alternative: a comprehension that returns a list of every key tied at the minimum
# min_key = [k for k, v in ocean_prox_dict.items() if v == min_value]
# min_key

In [None]:
# Downsample so that every label ends up with the same number of rows.
# Downsampling randomly drops rows from the larger classes until each class is
# as small as the smallest one (the author notes NEAR BAY = 2290 rows here),
# so every 'ocean_proximity' category (e.g. INLAND) ends up with that count.
# A balanced, smaller dataset is cheaper to train on and keeps a model from
# simply favoring the majority class.

def downsampling(df, target_column, min_key, min_value, random_state=24):
    """Balance ``df`` by downsampling every class of ``target_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    target_column : str
        Name of the label column whose classes are balanced.
    min_key : hashable
        Label of the smallest class. Its rows are kept unchanged.
    min_value : int
        Number of rows to keep for every other class.
    random_state : int, default 24
        Seed used both for sampling each class and for the final shuffle,
        so that re-running the notebook reproduces the same DataFrame.

    Returns
    -------
    pandas.DataFrame
        The balanced rows in shuffled order with a fresh 0..n-1 index.
    """
    # Start from the minority class, which is kept as it is
    balanced_parts = [df[df[target_column] == min_key]]

    for target in df[target_column].unique():
        # The minority class is already included in full
        if target == min_key:
            continue

        # Draw min_value rows of this class without replacement
        class_downsample = resample(
            df[df[target_column] == target],
            replace=False,
            n_samples=min_value,
            random_state=random_state,
        )
        balanced_parts.append(class_downsample)

    # Concatenate once instead of growing a DataFrame inside the loop
    df_process = pd.concat(balanced_parts)

    # Shuffle reproducibly, then discard the old row labels
    return shuffle(df_process, random_state=random_state).reset_index(drop=True)

# Balance the ISLAND-free data on ocean_proximity, using the smallest class as the anchor
housing_downsampled = downsampling(
    df=housing_filtered,
    target_column='ocean_proximity',
    min_key=min_key,
    min_value=min_value,
)

In [None]:
# Check for null values before filling the missing data.
# Missing numeric values are usually filled with the *mean* or the *median*:
#   - many outliers => median (a few extreme values won't distort the fill value); otherwise => mean
#   - an outlier is a rare value far away from most of the data, e.g. 10000 in (1, 2, 3, 10000)
housing_downsampled.info()

# Author's observation from the output: 'total_bedrooms' has 9080 non-null entries
# while the other columns have 9160, so it contains nulls that need to be filled.

In [None]:
# Mean of total_bedrooms -- one candidate value for filling the nulls
housing_downsampled['total_bedrooms'].mean()

In [None]:
# Median of total_bedrooms -- the other candidate value for filling the nulls
housing_downsampled['total_bedrooms'].median()

Note: To fill empty values in numerical attributes, mean and median are widely used. When the attribute has many outliers, median is recommended.

In [None]:
# Work on a copy so the downsampled DataFrame itself stays untouched
housing_fillNaN = housing_downsampled.copy()

# Replace the nulls in 'total_bedrooms' with that column's median
bedrooms_median = housing_fillNaN['total_bedrooms'].median()
housing_fillNaN['total_bedrooms'] = housing_fillNaN['total_bedrooms'].fillna(bedrooms_median)

In [None]:
# Print the DataFrame info again after .fillna() to re-check the non-null counts and dtypes
housing_fillNaN.info()

# Author's observation from the output: 'total_bedrooms' now has 9160 non-null entries.

In [None]:
# Look for duplicated rows.
# .duplicated() gives one True/False per row; used as a mask, it keeps only
# the rows flagged as duplicates. An empty result means there are none.
# ML models are usually trained on data without duplicates.

duplicate_mask = housing_fillNaN.duplicated()
housing_fillNaN[duplicate_mask]

# No duplicates turned up here. Note that .duplicated() compares all columns
# by default, so a row only counts as a duplicate when every value matches.

In [None]:
# Check how each numeric column correlates with median_house_value (Regression Task)
# Regression => 'median_house_value' is the anchor (target) column.
#   - Each value is the correlation of one column with 'median_house_value'.
#   - The anchor's own entry is 1.0, because a column is perfectly correlated with itself.

# Correlation is only defined for numbers, so leave out text columns such as
# 'ocean_proximity' (newer pandas raises an error instead of skipping them).
corrDatas = housing_fillNaN.select_dtypes(include='number').corr()['median_house_value']

# Plot the correlations: column names on x, correlation value on y
corrPlot = sns.scatterplot(x=corrDatas.index, y=corrDatas.values)
plt.xticks(rotation=90)  # column names are long; rotate them so they don't overlap
plt.show()

In [None]:
# Feature engineering
# Feature engineering takes raw data and turns it into features that a
# predictive model (machine learning, statistical modeling, deep learning)
# can use.

# Average number of people per household, rounded to a whole number
people_per_household = housing_fillNaN['population'] / housing_fillNaN['households']
housing_fillNaN['population_per_households'] = people_per_household.round()

In [None]:
# Average number of people per bedroom, rounded to a whole number
people_per_bedroom = housing_fillNaN['population'].div(housing_fillNaN['total_bedrooms'])
housing_fillNaN['population_per_bedrooms'] = people_per_bedroom.round()

In [None]:
# Encode the ocean_proximity strings as integer labels
le = LabelEncoder()
le.fit(housing_fillNaN['ocean_proximity'])
housing_fillNaN['ocean_proximity_num'] = le.transform(housing_fillNaN['ocean_proximity'])
