### Decision Tree, Random Forest, Linear Regression Problem 

In [1]:
# Libraries to process file and visualization for EDA
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

%matplotlib inline
# Use seaborn's 'white' style and 'notebook' context for any plots in this notebook
sns.set(style='white', context='notebook')

### Import Dataset

In [12]:
# Load the startup dataset from the CSV file in the working directory into a pandas DataFrame.
dataset = pd.read_csv('50_Startups.csv')
# Keep an independent snapshot of the raw data for later reference. A plain
# assignment would only create a second name for the same DataFrame, so the
# in-place encoding of 'State' further down would silently modify it as well.
dataset_original = dataset.copy()

In [13]:
# Preview the first five rows to see the columns and typical values
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [14]:
# List the column labels of the DataFrame
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [15]:
# Inspect each column's dtype and non-null count

dataset.info()

# Observation: the dataset has 50 records in total.
# Every column except 'State' (dtype object) is numeric with dtype float64, and each column
# has 50 non-null entries, i.e. there are no missing values.

# All columns are considered significant for the analysis, so none is dropped.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max), one row per numeric column
dataset.describe().transpose()

# Rule of thumb used here: a gap of more than 10% between MEAN and MEDIAN (the 50% column)
# hints at possible outliers.
# Observation: mean and median are close for every column, so there are few or no outliers.

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
R&D Spend,50.0,73721.6156,45902.256482,0.0,39936.37,73051.08,101602.8,165349.2
Administration,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
Marketing Spend,50.0,211025.0978,122290.310726,0.0,129300.1325,212716.24,299469.085,471784.1
Profit,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


### Checking missing values

In [17]:
# part 1 : Handling missing value
# Number of missing entries per column, and the same figure as a percentage of all rows
missing_values = dataset.isnull().sum()
missing_value_percentages = missing_values * 100 / len(dataset)

# Build the summary from a dict of columns so each column keeps its own dtype:
# "Total" stays an integer count and "%" stays a float. Stacking the two Series as
# rows and transposing would upcast the counts to float (0.0 instead of 0).
missing_value_df = pd.DataFrame({"Total": missing_values, "%": missing_value_percentages})
missing_value_df

# Observation : no column has any missing values.

Unnamed: 0,Total,%
R&D Spend,0.0,0.0
Administration,0.0,0.0
Marketing Spend,0.0,0.0
State,0.0,0.0
Profit,0.0,0.0


In [18]:
# Look at the first rows again; 'State' still holds the state names at this point
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Label-encode the categorical 'State' column in place.
# LabelEncoder is already imported in the first cell of the notebook, so it is not
# imported again here.
encoder = LabelEncoder()

# fit_transform learns the distinct state names and replaces each one with an integer
# code; the fitted `encoder` retains the name <-> code mapping (encoder.classes_).
dataset['State'] = encoder.fit_transform(dataset['State'])

In [21]:
# Verify the encoding: 'State' should now contain integer codes instead of names
dataset.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [None]:
# 'State' is a categorical (object) column that is considered significant for model
# training, so it has to be converted to numbers first.

# part 2 : Encoding
# Label encoding via pandas category codes

# Frequency of each distinct value in 'State'
states = dataset['State'].value_counts()
print(states,'\n')

# Cast to 'category' and keep only the integer code of each category
dataset['State'] = dataset['State'].astype('category').cat.codes

print(dataset.head())
# Observation : the data preview shows the states New York, California and Florida.
# Label encoding replaces the values of 'State' with integer codes in the same column;
# the dummy columns State_0, State_1, State_2 only appear after the one-hot encoding step.

In [12]:
# dataset.info()

In [13]:
# One-hot encoding: replace 'State' with one indicator column per distinct value
dataset = pd.get_dummies(data=dataset, columns=['State'])
print(dataset.head())

# To avoid multicollinearity only n-1 of the n dummy columns are needed (dummy variable trap),
# e.g. by dropping the column with the _0 suffix. That step is currently disabled:
# dataset = dataset.drop(['State_0'], axis=1)

dataset.info()

# Observation : after one-hot encoding the dummy columns State_0, State_1 and State_2 replace
# the original 'State' column.
# NOTE(review): the dtype of the dummy columns depends on the pandas version
# (uint8 in older releases, bool since pandas 2.0) - confirm against the info() output.

In [14]:
# dataset['State_0']

In [15]:
# x = dataset.drop(columns=['Profit'], axis=1)
# print('x\n ',x.head())
# y = dataset['Profit']
# print('\ny\n ',y.head())