**1) Simple linear regression**

a) data preparation (missing data)

In [None]:
import pandas as pd

!wget -O FuelConsumption.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%202/data/FuelConsumptionCo2.csv

df = pd.read_csv("FuelConsumption.csv")

# Report the number of missing values per column. Printed explicitly because
# a notebook cell only auto-displays its *last* expression.
print(df.isna().sum())

# Impute first, drop last: once dropna() has run there is nothing left to
# fill, so the imputation steps must precede it to have any effect.
# Results are assigned back instead of using `inplace=True` on a column
# selection, which operates on a temporary under pandas Copy-on-Write and
# may leave `df` unchanged.
df['MODEL'] = df['MODEL'].fillna(df['MODEL'].value_counts().idxmax())  # most frequent value
df['CYLINDERS'] = df['CYLINDERS'].fillna(df['CYLINDERS'].mean())       # column mean
df['ENGINESIZE'] = df['ENGINESIZE'].interpolate('linear')              # linear interpolation

# Remove the MAKE column, then drop any row that still has a missing value.
df = df.drop(['MAKE'], axis=1)
df = df.dropna(axis=0)

b) data preparation (standardization)

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Standardise the two columns of interest with StandardScaler and wrap the
# resulting array back into a DataFrame carrying the original column names.
cdf = df[['ENGINESIZE','CO2EMISSIONS']]
ss = StandardScaler()
cdf_std = pd.DataFrame(ss.fit_transform(cdf), columns=cdf.columns)
cdf_std.head()

c) linear correlation check

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Scatter engine size against CO2 emissions to eyeball whether the
# relationship looks linear.
cdf = df[['ENGINESIZE','CO2EMISSIONS']]
plt.scatter(cdf['ENGINESIZE'], cdf['CO2EMISSIONS'], color='blue')
plt.xlabel("Engine size")
plt.ylabel("Emission")

d) data split

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Single-feature design matrix (2-D, via the double-bracket selection) and
# target vector, split 80/20 with a fixed seed.
X, y = np.asarray(cdf[['ENGINESIZE']]), np.asarray(cdf['CO2EMISSIONS'])
split = train_test_split(X, y, test_size=0.2, random_state=4)
X_train, X_test, y_train, y_test = split
print('Train set:', X_train.shape, y_train.shape)
print('Test set:', X_test.shape, y_test.shape)

e) model development

In [None]:
from sklearn import linear_model

# Ordinary least-squares fit on the training portion, then report the
# fitted slope(s) and intercept.
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
slope, intercept = regr.coef_, regr.intercept_
print('Slope: ', slope)
print('Intercept: ', intercept)

f) fit a line

In [None]:
import matplotlib.pyplot as plt

# Predictions on the training inputs give the points of the fitted line,
# drawn in red over the blue training scatter.
yhat_train = regr.predict(X_train)
plt.scatter(X_train, y_train, color='blue')
plt.plot(X_train, yhat_train, '-r')
plt.xlabel("Engine size")
plt.ylabel("Emission")

g) model metrics

In [None]:
from sklearn.metrics import r2_score
import numpy as np

# Evaluate on the held-out test set: MAE and MSE from the raw residuals,
# R^2 from scikit-learn.
yhat_test = regr.predict(X_test)
residuals = yhat_test - y_test
print("Mean absolute error: %.2f" % np.mean(np.abs(residuals)))
print("Mean sum of squares (MSE): %.2f" % np.mean(residuals ** 2))
print("R2-score: %.2f" % r2_score(y_test, yhat_test))

**2) Multiple regression**

In [None]:
import numpy as np
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# Three-predictor linear regression of CO2 emissions, using the same 80/20
# seeded split as the simple model.
X, y = (
    np.asarray(df[['ENGINESIZE','FUELCONSUMPTION_COMB','CYLINDERS']]),
    np.asarray(df['CO2EMISSIONS']),
)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
print('Coefficient: ', regr.coef_)
print('Intercept: ', regr.intercept_)

# Held-out error metrics.
yhat_test = regr.predict(X_test)
residuals = yhat_test - y_test
print("Mean absolute error: %.2f" % np.mean(np.abs(residuals)))
print("Mean sum of squares (MSE): %.2f" % np.mean(residuals ** 2))
print("R2-score: %.2f" % r2_score(y_test, yhat_test))

a) stepwise regression

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

# Forward sequential feature selection scored by R^2 (no cross-validation),
# reusing X_train / X_test / y_train / y_test from the multiple-regression cell.
lr = LinearRegression()
sfs = SFS(lr, k_features=3, forward=True, scoring='r2', cv=None).fit(X_train, y_train)

# Reduce both partitions to the selected feature subset.
X_train_sfs, X_test_sfs = sfs.transform(X_train), sfs.transform(X_test)

# Refit on the selected features and predict the held-out set.
lr.fit(X_train_sfs, y_train)
y_pred = lr.predict(X_test_sfs)

# Adjusted R^2 penalises the selector's score for the number of features k
# relative to the number of training samples n.
n, k = X_train.shape[0], len(sfs.k_feature_names_)
r2 = sfs.k_score_
adj_r2 = 1 - (1 - r2) * ((n - 1) / (n - k - 1))

print(f'TRAIN R2: {r2}')
print(f'TRAIN ADJUSTED R2: {adj_r2}')
print(f'TEST R2: {r2_score(y_test, y_pred)}')

# Score as a function of the number of selected features.
fig = plot_sfs(sfs.get_metric_dict(), ylabel='R^2')
plt.grid()
plt.show()

b) ridge regression

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Ridge regression with the estimator's default regularisation strength,
# reusing the train/test split from the multiple-regression cell.
r = Ridge()
r.fit(X_train, y_train)
pred_results = r.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. R^2 is not
# symmetric in its arguments, so swapping them reports a different (wrong)
# score; MSE is symmetric but is written the same way for consistency.
print('r2 score is:', r2_score(y_test, pred_results))
print('mean squared error is:', mean_squared_error(y_test, pred_results))

**3) Polynomial regression**

a) Pair-wise correlation

In [None]:
import seaborn as sn

# Grid of pairwise plots across the columns of df, for a visual check of
# which variable pairs look related.
sn.pairplot(data=df)

b) model development and evaluation

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn import linear_model
import numpy as np

# Degree-3 polynomial regression of CO2 emissions on three predictors.
X = np.asarray(df[['ENGINESIZE','FUELCONSUMPTION_COMB','CYLINDERS']])
y = np.asarray(df['CO2EMISSIONS'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Fit the feature expansion on the training data only.
poly = PolynomialFeatures(degree=3)
train_x_poly = poly.fit_transform(X_train)

clf = linear_model.LinearRegression()
# NOTE: `fit` returns the estimator itself, not predictions; the name
# `train_y_` is kept only so any later cell referring to it still works.
train_y_ = clf.fit(train_x_poly, y_train)
print('Coefficients: ', clf.coef_)
print('Intercept: ', clf.intercept_)

# Apply the already-fitted transformer to the test set; a transformer must
# never be re-fitted on held-out data.
test_x_poly = poly.transform(X_test)
test_y_ = clf.predict(test_x_poly)
print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - y_test)))
# The quantity below is a mean of squared residuals (MSE), not a residual
# *sum* of squares; label matches the earlier regression cells.
print("Mean sum of squares (MSE): %.2f" % np.mean((test_y_ - y_test) ** 2))
print("R2-score: %.2f" % r2_score(y_test, test_y_))

**4) Non-linear regression**

a) sigmoid function

In [None]:
import numpy as np
import pandas as pd

!wget -nv -O china_gdp.csv https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-ML0101EN-SkillsNetwork/labs/Module%202/data/china_gdp.csv
# Load the China GDP series. NOTE: this rebinds `df`, which until now held
# the fuel-consumption data.
df = pd.read_csv("china_gdp.csv")
x_data = df["Year"].values
y_data = df["Value"].values

# Raw series as red dots.
plt.figure(figsize=(8,5))
plt.plot(x_data, y_data, 'ro')
plt.xlabel('Year')
plt.ylabel('GDP')
plt.show()

b) initial sigmoid model fit

In [None]:
import matplotlib.pyplot as plt

def sigmoid(x, Beta_1, Beta_2):
    """Logistic curve 1 / (1 + exp(-Beta_1 * (x - Beta_2))).

    Beta_1 sets the steepness and Beta_2 the midpoint (the x at which the
    value is 0.5). `x` may be a scalar or a NumPy array; the result has the
    same shape.
    """
    exponent = -Beta_1 * (x - Beta_2)
    return 1 / (1 + np.exp(exponent))
# Hand-picked starting parameters for a first visual comparison.
beta_1, beta_2 = 0.10, 1990.0
Y_pred = sigmoid(x_data, beta_1, beta_2)

# The sigmoid lies in (0, 1); scale it by 1.5e13 so it is drawn on the same
# axis as the raw GDP values.
plt.plot(x_data, Y_pred * 15000000000000.)
plt.plot(x_data, y_data, 'ro')
plt.xlabel('Year')
plt.ylabel('GDP')

c) final sigmoid model fit

In [None]:
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt

# Normalise both axes by their maxima before fitting.
x_scale = max(x_data)
xdata = x_data / x_scale
ydata = y_data / max(y_data)

# Least-squares fit of the sigmoid parameters on the normalised data.
popt, pcov = curve_fit(sigmoid, xdata, ydata)
print(" beta_1 = %f, beta_2 = %f" % (popt[0], popt[1]))

# Evaluation grid for drawing the fitted curve. It must be normalised by the
# SAME factor as the data (x_scale); dividing by the grid's own maximum would
# place the curve on a slightly different x scale than the one it was
# fitted on.
x = np.linspace(1960, 2015, 55)
x = x / x_scale

plt.figure(figsize=(8,5))
y = sigmoid(x, *popt)
plt.plot(xdata, ydata, 'ro', label='data')
plt.plot(x, y, linewidth=3.0, label='fit')
plt.legend(loc='best')
plt.ylabel('GDP')
plt.xlabel('Year')
plt.show()

d) model evaluation

In [None]:
from sklearn.metrics import r2_score

# Random ~80/20 train/test mask over the normalised series. A seeded
# generator makes the split (and therefore the reported metrics)
# reproducible; the seed matches random_state=4 used by the earlier splits.
# The mask is sized from xdata itself rather than from `df`, so it cannot go
# stale if `df` is rebound.
rng = np.random.default_rng(4)
msk = rng.random(len(xdata)) < 0.8
train_x, test_x = xdata[msk], xdata[~msk]
train_y, test_y = ydata[msk], ydata[~msk]

# Refit on the training portion only, then predict the held-out points.
popt, pcov = curve_fit(sigmoid, train_x, train_y)
y_hat = sigmoid(test_x, *popt)

print("Mean absolute error: %.2f" % np.mean(np.absolute(y_hat - test_y)))
# Mean of squared residuals, i.e. MSE (not a residual *sum* of squares).
print("Mean sum of squares (MSE): %.2f" % np.mean((y_hat - test_y) ** 2))
# r2_score expects (y_true, y_pred); R^2 is not symmetric in its arguments.
print("R2-score: %.2f" % r2_score(test_y, y_hat))