In [24]:
import pandas as pd
import numpy as np

In [25]:
data = pd.read_csv('50_Startups.csv')

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        48 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  49 non-null     float64
 3   State            49 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [27]:
#Find null values
data.isnull().sum()



R&D Spend          2
Administration     0
Marketing Spend    1
State              1
Profit             0
dtype: int64

In [29]:
data.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [30]:
data['R&D Spend'] = data['R&D Spend'].fillna(data['R&D Spend'].mean())

In [31]:
data['Marketing Spend'] = data['Marketing Spend'].fillna(data['Marketing Spend'].mean())

In [32]:
data['State'] = data['State'].fillna(data['State'].mode()[0])

In [33]:
#Verify null values ifany

data.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [34]:
data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,71981.472708,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,New York,149759.96


In [35]:
#Lets see what the data looks like

data.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,71981.472708,121344.6396,210234.904898,112012.6392
std,44987.814369,28017.802755,122162.595979,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,71981.472708,122699.795,210516.287449,107978.19
75%,94458.8075,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [36]:
#Lets see the unique count in State variable
data['State'].value_counts() 

New York      18
Florida       16
California    16
Name: State, dtype: int64

In [37]:
#define Features and Labels object
features = data.iloc[:,:-1].values
labels = data.iloc[:,-1].values



In [38]:
features

array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [71981.47270833333, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'New York'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [71981.47270833333, 91790.61, 210234.9048979592, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [9174

In [39]:
labels

array([192261.83, 191792.06, 191050.39, 182901.99, 166187.94, 156991.12,
       156122.51, 155752.6 , 152211.77, 149759.96, 146121.95, 144259.4 ,
       141585.52, 134307.35, 132602.65, 129917.04, 126992.93, 125370.37,
       124266.9 , 122776.86, 118474.03, 111313.02, 110352.25, 108733.99,
       108552.04, 107404.34, 105733.54, 105008.31, 103282.38, 101004.64,
        99937.59,  97483.56,  97427.84,  96778.92,  96712.8 ,  96479.51,
        90708.19,  89949.14,  81229.06,  81005.76,  78239.91,  77798.83,
        71498.49,  69758.98,  65200.33,  64926.08,  49490.75,  42559.73,
        35673.41,  14681.4 ])

In [40]:
#In order to convert categorial data into Numerical data, lets perform one hot encoding on State feature in features
#Dummy Variables
from sklearn.preprocessing import OneHotEncoder
oheState = OneHotEncoder(sparse=False)
fstate = oheState.fit_transform(features[:,3].reshape(-1,1))



In [41]:
features = np.concatenate((features[:,:-1],fstate),axis=1)

In [42]:
#Standardization

from sklearn.preprocessing import StandardScaler
scfeatures = StandardScaler()

In [43]:
#Lets standardize features 
features = scfeatures.fit_transform(features)

In [44]:
#Firstly we are going to deploy a regression model in ML so no need to stadardize label
 #Train Test split 
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(features,labels,test_size=0.2,random_state=50) 

In [45]:
print(xtrain.shape)
print(xtest.shape)


(40, 6)
(10, 6)


In [46]:
#Build the Model

from sklearn.linear_model import LinearRegression
modelR = LinearRegression()
modelR.fit(xtrain,ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [47]:
#Check the quality (whether the model is generalized or not)
print('Training Score:',modelR.score(xtrain,ytrain))
print('Test Score:',modelR.score(xtest,ytest))

Training Score: 0.9135433631989957
Test Score: 0.9376829600715654


In [67]:
#so our Test score Accuracy is 93.76%, lets consider that our required CL is 92% .Hence in this case our ML model itself satisfies the goal.
#Lets deploy the model

#Deployment


#Small heuristic for Categoriacl Variable

from sklearn.preprocessing import OneHotEncoder
oheState = OneHotEncoder(sparse=False)
Allowed_states = np.array(['California','New York','Florida'])

fST  =  oheState.fit_transform(Allowed_states.reshape(-1,1))

#1 Getting user input in the form of float
Research_spend = float(input("Enter R&D spend:"))
Adm_spend = float(input("Enter Administration spend:"))
Marketing_spend = float(input("Enter Marketing spend:"))

State=str(input("(Hint:California,New york,Florida) Enter the State:"))

#Performing tranformation for Input value
State = oheState.transform(np.array([[State]]))


#Building Arrays

features = np.array([[Research_spend,Adm_spend,Marketing_spend]])
features = np.concatenate((features,State) , axis = 0)



#Apply Standardization

features_scaled = scfeatures.fit_transform(features)

#4 Prediction
Profit = modelR.predict(features_scaled)

print('Profit Predicted is:',Profit)


Enter R&D spend:6685
Enter Administration spend:5775
Enter Marketing spend:85757
(Hint:California,New york,Florida) Enter the State:New York


ValueError: ignored