# Introduction to Scikit-Learn

Datasets on Scikit-Learn: https://scikit-learn.org/stable/datasets/index.html

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn import datasets
boston = datasets.load_boston()

In [3]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [4]:
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [18]:
import statsmodels.api as sm
Y = boston['target']
#X = boston['data']
X = pd.DataFrame(boston['data'], columns = boston['feature_names'])
X = sm.add_constant(X)
model = sm.OLS(Y,X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,108.1
Date:,"Thu, 17 Dec 2020",Prob (F-statistic):,6.72e-135
Time:,20:14:53,Log-Likelihood:,-1498.8
No. Observations:,506,AIC:,3026.0
Df Residuals:,492,BIC:,3085.0
Df Model:,13,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,36.4595,5.103,7.144,0.000,26.432,46.487
CRIM,-0.1080,0.033,-3.287,0.001,-0.173,-0.043
ZN,0.0464,0.014,3.382,0.001,0.019,0.073
INDUS,0.0206,0.061,0.334,0.738,-0.100,0.141
CHAS,2.6867,0.862,3.118,0.002,0.994,4.380
NOX,-17.7666,3.820,-4.651,0.000,-25.272,-10.262
RM,3.8099,0.418,9.116,0.000,2.989,4.631
AGE,0.0007,0.013,0.052,0.958,-0.025,0.027
DIS,-1.4756,0.199,-7.398,0.000,-1.867,-1.084

0,1,2,3
Omnibus:,178.041,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,783.126
Skew:,1.521,Prob(JB):,8.84e-171
Kurtosis:,8.281,Cond. No.,15100.0


In [19]:
X.shape

(506, 14)

In [20]:
Y.shape

(506,)

In [6]:
from sklearn import linear_model
#help(linear_model)

In [7]:
model = linear_model.LinearRegression() #fit_intercept, default olarak True ve kendisi constant ekliyor

In [14]:
model.fit(X, Y)
print(model.intercept_) #coefficient of constant
print(model.coef_) 
#scientific notation eger - ise en bastaki rakam bir saga gidiyor eger + ise en bastaki rakamdan sonraki rakam öne gidiyor
print(round(model.coef_[5],4)) #4 burada decimal

36.4594883850901
[ 0.00000000e+00 -1.08011358e-01  4.64204584e-02  2.05586264e-02
  2.68673382e+00 -1.77666112e+01  3.80986521e+00  6.92224640e-04
 -1.47556685e+00  3.06049479e-01 -1.23345939e-02 -9.52747232e-01
  9.31168327e-03 -5.24758378e-01]
-17.7666


In [9]:
model.score(X,Y) #R-square

0.7406426641094094

In [10]:
from sklearn.metrics import r2_score
predictions = model.predict(X)
r2_score(Y, predictions)

0.7406426641094094

In [15]:
pd.DataFrame({"Actual":Y,"Predictions":predictions})

Unnamed: 0,Actual,Predictions
0,24.0,30.003843
1,21.6,25.025562
2,34.7,30.567597
3,33.4,28.607036
4,36.2,27.943524
...,...,...
501,22.4,23.533341
502,20.6,22.375719
503,23.9,27.627426
504,22.0,26.127967


In [16]:
# there is an algorithm that does feature elimination
# number of features desired:5 I would like to have 5 features

#### Sidenote: Using scipy.stats for regression (supports only Simple Linear Regression)

In [17]:
from scipy import stats #only 1 X variable and 1 Y variable
X_scipy = X['CRIM']
slope, intercept, r_value, p_value, std_err = stats.linregress(Y, X_scipy)
print(slope, intercept, r_value, p_value, std_err)

-0.3631599222576031 11.796535750221913 -0.38830460858681165 1.1739870821943443e-19 0.03839017467422353
