# Import library and read dummy dataset

In [1]:
import pandas as pd
import numpy as np
# Load the dummy dataset from 'data.csv' (path is relative to the working directory).
dataframe = pd.read_csv('data.csv') # Dummy dataset containing 50 rows.

In [2]:
# Show the raw frame; empty cells in the rendered table are missing (NaN) values.
dataframe

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,,California,156122.51
7,130298.13,145530.06,323876.68,,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


# 5

In [3]:
dataframe.isnull().values.any() # True if at least one cell anywhere in the dataframe is null (NaN)

True

In [4]:
dataframe.isna().any() # Per-column flag: True where the column contains at least one NaN

R&D Spend          False
Administration     False
Marketing Spend     True
State               True
Profit             False
dtype: bool

# 6

In [5]:
# Keep only the columns flagged as containing NaN, then list their names.
dataframe.loc[:, dataframe.isna().any()].columns.tolist()

['Marketing Spend', 'State']

In [6]:
# Preview the data to judge which columns are categorical and which are numerical
dataframe.head() # display top 5 rows

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [7]:
# Column dtypes: float64 columns hold numbers; the object column ('State') holds strings.
dataframe.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [8]:
# As the dtypes above show, the column 'Marketing Spend' contains numerical data
# and the column 'State' contains categorical data.

# 7
- Replace NaN with the most common value in the case of categorical data.

In [9]:
# value_counts: frequency of each category, used to find the most common value
# in the categorical column.
dataframe['State'].value_counts()

California    17
Florida       14
New York      14
Name: State, dtype: int64

In [10]:
# value_counts(sort=True) orders the categories by descending frequency,
# so the first index label is the most common value.
state_counts = dataframe['State'].value_counts(sort=True)
most_common = state_counts.index[0]
most_common

'California'

In [11]:
# Replace NaN in the categorical 'State' column with its most common value.
# The dict form restricts the fill to that one column. The result is assigned
# back instead of using inplace=True, so the cell does not mutate the frame
# object in place.
dataframe = dataframe.fillna({'State': most_common})

- Replace NaN with the average of all the other values in the case of numerical data.

In [12]:
# Numerical column: use the column mean as the fill value.
# pandas skips NaN entries when averaging, so only observed values contribute.
avg = dataframe['Marketing Spend'].mean()
avg

218191.2839130435

In [13]:
# Replace NaN in the numerical 'Marketing Spend' column with the column mean.
# The result is assigned back instead of using inplace=True, so the cell does
# not mutate the frame object in place.
dataframe = dataframe.fillna({'Marketing Spend': avg})

# 8

In [14]:
dataframe.isnull().values.any() # Re-check after filling: False means no NaN is left in the entire dataframe

False

In [15]:
dataframe.isna().any() # Per-column re-check: False means the column has no NaN left

R&D Spend          False
Administration     False
Marketing Spend    False
State              False
Profit             False
dtype: bool

# 9

In [16]:
# Preview the frame after the NaN values have been filled
dataframe.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [17]:
# Positional split using dataframe.iloc[rows, columns]; positions are zero-based:
#   ':'  -> all rows
#   [3]  -> the column at position 3, returned as a one-column DataFrame
#   0:3  -> the columns at positions 0, 1 and 2
x_numerical = dataframe.iloc[:, 0:3]    # numerical data: 'R&D Spend', 'Administration', 'Marketing Spend'
x_categorical = dataframe.iloc[:, [3]]  # categorical data: 'State'
y = dataframe.iloc[:, [4]]              # target: the last column (position 4), i.e. 'Profit'

In [18]:
y.head() # display the first 5 rows of the target ('Profit')

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [19]:
x_categorical.head() # display the first 5 rows of the categorical column ('State')

Unnamed: 0,State
0,New York
1,California
2,Florida
3,New York
4,Florida


In [20]:
x_numerical.head() # display the first 5 rows of the numerical columns

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


# 10

In [88]:
# Show the frame after imputation; the previously empty cells now hold the fill values.
dataframe

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,218191.283913,California,156122.51
7,130298.13,145530.06,323876.68,California,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [89]:
X = dataframe.iloc[:,:4] # feature columns: positions 0-3, i.e. every column except 'Profit'

In [90]:
# Preview the feature columns
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [91]:
X_num = X.iloc[:,:3] # numerical features: columns at positions 0, 1 and 2 (a DataFrame)
X_cat = X.iloc[:,3] # categorical feature 'State': a single integer position yields a Series

In [92]:
# Preview the categorical Series
X_cat.head()

0      New York
1    California
2       Florida
3      New York
4       Florida
Name: State, dtype: object

In [93]:
# Preview the numerical feature columns
X_num.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


# 11

In [94]:
# Category frequencies after imputation; these are the distinct values to encode.
X_cat.value_counts()

California    22
Florida       14
New York      14
Name: State, dtype: int64

In [95]:
# Empty frame that will collect the 1/0 indicator columns for the categorical feature
dataframe_cat = pd.DataFrame()

In [96]:
# Convert the categorical values to a 1/0 encoding, one indicator column per state:
# np.where gives 1 where the row's state equals the name and 0 otherwise.
# NOTE(review): all three indicators are kept, so in every row they sum to 1 and are
# linearly dependent once the model also fits an intercept; the individual state
# coefficients are then not uniquely determined. Consider dropping one indicator
# if the coefficients are to be interpreted - confirm with the assignment.
for state_name in ('California', 'New York', 'Florida'):
    dataframe_cat[state_name] = np.where(X_cat == state_name, 1, 0)

In [97]:
# Preview the 1/0 indicator columns
dataframe_cat.head()

Unnamed: 0,California,New York,Florida
0,0,1,0
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


# 12

In [98]:
# Put the numerical features and the indicator columns side by side
# (column-wise concatenation; rows are matched on the index).
glued_dataframe = pd.concat(objs=[X_num, dataframe_cat], axis='columns')
glued_dataframe.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,New York,Florida
0,165349.2,136897.8,471784.1,0,1,0
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,0,1
3,144372.41,118671.85,383199.62,0,1,0
4,142107.34,91391.77,366168.42,0,0,1


In [99]:
# From here on X is the combined frame: numerical features plus the 1/0 state indicators
X = glued_dataframe

# 13

In [100]:
# Hold out 30% of the rows as a test set (random_state fixed so the split is repeatable),
# then fit a linear regression on the training part.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
model = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
model

LinearRegression()

# 14

In [101]:
model.coef_ # fitted coefficients, in the same order as X.columns (a 2-D array with a single row here)

array([[ 7.90910038e-01,  2.96021037e-02,  2.98037742e-02,
        -1.43873763e+03,  6.45911591e+02,  7.92826035e+02]])

In [102]:
# The reporting code below was given in the question.
# Prints the intercept, then one line per feature: the name left-justified to
# 30 characters, three spaces, and the coefficient right-justified to 25 characters.
print('Model Intercept: ',model.intercept_)
coefResults = list(zip(X.columns,model.coef_[0]))
for featureName, coefValue in coefResults:
    print(f"{str(featureName):<30}   {str(coefValue):>25}")

Model Intercept:  [43368.59437609]
R&D Spend                               0.7909100384023781
Administration                        0.029602103683761132
Marketing Spend                        0.02980377419672228
California                              -1438.737626104836
New York                                 645.9115908083481
Florida                                  792.8260352964897


# 15

- Here I am interpreting the coefficient of 'R&D Spend'.
- A 1-unit increase in 'R&D Spend' is associated with a 0.7909-unit change in the dependent variable (y), holding the other features constant,
- i.e. the overall 'Profit' in our dataset.

# 16

In [103]:
X.corr() # pairwise correlation between the feature columns (values range from -1 to 1)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,New York,Florida
R&D Spend,1.0,0.241955,0.746604,-0.051707,-0.003269,0.060433
Administration,0.241955,1.0,0.005887,0.09232,-0.099167,-0.002897
Marketing Spend,0.746604,0.005887,1.0,-0.207324,0.074245,0.15496
California,-0.051707,0.09232,-0.207324,1.0,-0.552771,-0.552771
New York,-0.003269,-0.099167,0.074245,-0.552771,1.0,-0.388889
Florida,0.060433,-0.002897,0.15496,-0.552771,-0.388889,1.0


In [104]:
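# Example: 'California' and 'R&D Spend' have a slightly negative, near-zero correlation (about -0.05),
# whereas 'Marketing Spend' and 'R&D Spend' are strongly positively correlated (about 0.75).
# Similarly, you can read off the relation between any other pair of columns in your dataset.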