# ---------------------------------  Linear Regression ---------------------------------------

# 1. Reading the Wine dataset

In [1]:
import math

import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the wine dataset from a CSV file given by a relative path.
wine = pd.read_csv(filepath_or_buffer='Wine_student.csv')

# 2. Exploring the dataset

In [3]:
# Display the (rows, columns) dimensions of the loaded frame.
wine.shape

(1599, 12)

In [4]:
# Preview the first few rows to see the column names and typical values.
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
# List the dtype of every column to confirm which ones are numeric.
wine.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

Insights: All independent variables are numeric.

# 3. Preparing the data for modelling


# 3.1 Checking for any null values and dealing with them

In [6]:
# Total number of missing cells: the first .sum() counts nulls per column,
# the second adds those per-column counts together (vectorized, instead of
# iterating the Series with the builtin sum()).
wine.isnull().sum().sum()

39

Insights: Since there are only 39 null values in the whole dataset, we can drop the rows that contain them, as this won't cause significant data loss.

In [7]:
# Drop every row that contains at least one missing value, then verify that
# no nulls remain and report the reduced shape.
wine = wine.dropna()
# Vectorized total of remaining nulls (per-column counts, then their sum).
print(wine.isnull().sum().sum())
print(wine.shape)

0
(1562, 12)


# 3.2 Preparing Inputs and Output

In [8]:
# Separate the predictors from the response: every column except 'quality'
# is a feature, and 'quality' alone (kept as a one-column frame) is the target.
features = wine.drop(labels=['quality'], axis='columns')
target = wine.loc[:, ['quality']]

In [9]:
# Show both shapes, one per line, to confirm the row counts match.
print(target.shape, features.shape, sep='\n')

(1562, 1)
(1562, 11)


# 3.3 Preparing the training and testing sets 

In [10]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=42,
)

In [11]:
# Show the train and test feature shapes, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(1093, 11)
(469, 11)


# 4. Modelling

# 4.1 Training(fitting linear regression on the train data)

In [12]:
# Create an ordinary least-squares regressor with default settings and fit it
# on the training split; the fitted estimator is displayed as the cell output.
lm = LinearRegression()
lm.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Show the fitted slope coefficients (one value per feature column).
print('Coefficients: \n', lm.coef_)

Coefficients: 
 [[ 4.97216525e-02 -1.15340341e+00 -1.98232750e-01  4.07354268e-02
  -1.88150196e+00  4.25115869e-03 -2.96443486e-03 -3.46339864e+01
  -3.15136155e-01  8.69840479e-01  2.77242821e-01]]


In [14]:
# Show the fitted intercept term of the regression.
print(lm.intercept_)

[38.11606028]


# 4.2 Testing/Predicting on the test data

In [15]:
# Predict the target for the held-out test features using the fitted model.
y_pred=lm.predict(X_test)

# 4.3 Performance evaluation of the model using RMSE

In [16]:
# Root-mean-squared error (RMSE) on the test split: the square root of the
# mean squared difference between the true values (y_test) and the model's
# predictions (y_pred). It is expressed in the same units as the target.
# `math` and `mean_squared_error` come from the notebook's top import cell.
rmse = math.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

0.6494032431478547


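Conclusion: Our linear regression model has a root-mean-squared error (RMSE) of about 0.65 on the test set. RMSE is measured in the units of the target, so this is a typical prediction error of roughly 0.65 quality points, not an error rate.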