<a href="https://colab.research.google.com/github/amiya559/GreatLearningUseCases/blob/master/DataScienceWithPythonLinearRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

"""
Data set link: http://archive.ics.uci.edu/ml/datasets/Automobile

Data Set Information:

This data set consists of three types of entities: (a) the specification of an auto in terms of various 
characteristics, (b) its assigned insurance risk rating, (c) its normalized losses in use as compared to other cars. 
The second rating corresponds to the degree to which the auto is more risky than its price indicates. Cars are 
initially assigned a risk factor symbol associated with its price. Then, if it is more risky (or less), this symbol 
is adjusted by moving it up (or down) the scale. Actuarians call this process "symboling". A value of +3 indicates 
that the auto is risky, -3 that it is probably pretty safe. 

The third factor is the relative average loss payment per insured vehicle year. This value is normalized for all 
autos within a particular size classification (two-door small, station wagons, sports/speciality, etc...), 
and represents the average loss per car per year. 

Note: Several of the attributes in the database could be used as a "class" attribute.


Attribute Information:

Attribute: Attribute Range

1. symboling: -3, -2, -1, 0, 1, 2, 3.
2. normalized-losses: continuous from 65 to 256.
3. make:
alfa-romero, audi, bmw, chevrolet, dodge, honda,
isuzu, jaguar, mazda, mercedes-benz, mercury,
mitsubishi, nissan, peugot, plymouth, porsche,
renault, saab, subaru, toyota, volkswagen, volvo

4. fuel-type: diesel, gas.
5. aspiration: std, turbo.
6. num-of-doors: four, two.
7. body-style: hardtop, wagon, sedan, hatchback, convertible.
8. drive-wheels: 4wd, fwd, rwd.
9. engine-location: front, rear.
10. wheel-base: continuous from 86.6 to 120.9.
11. length: continuous from 141.1 to 208.1.
12. width: continuous from 60.3 to 72.3.
13. height: continuous from 47.8 to 59.8.
14. curb-weight: continuous from 1488 to 4066.
15. engine-type: dohc, dohcv, l, ohc, ohcf, ohcv, rotor.
16. num-of-cylinders: eight, five, four, six, three, twelve, two.
17. engine-size: continuous from 61 to 326.
18. fuel-system: 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.
19. bore: continuous from 2.54 to 3.94.
20. stroke: continuous from 2.07 to 4.17.
21. compression-ratio: continuous from 7 to 23.
22. horsepower: continuous from 48 to 288.
23. peak-rpm: continuous from 4150 to 6600.
24. city-mpg: continuous from 13 to 49.
25. highway-mpg: continuous from 16 to 54.
26. price: continuous from 5118 to 45400.

"""

# Load the raw UCI Automobile data. The file is read as having no header row,
# so the 26 column labels are supplied here, in file order.
# NOTE(review): attribute 1 is called "symboling" in the data set description;
# the label 'symbolic' is kept because it is the name used from here on.
car_df = pd.read_csv(
    "imports-85.data",
    header=None,
    names=[
        # risk rating and categorical descriptors
        'symbolic', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
        'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
        # body dimensions and weight
        'wheel-base', 'length', 'width', 'height', 'curb-weight',
        # engine characteristics
        'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
        'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
        # fuel economy and the regression target
        'city-mpg', 'highway-mpg', 'price',
    ],
)

# print(car_df.dtypes)
"""
Output:
symbolic              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

Many of the columns have dtype object, which means they hold strings. ML algorithms can't handle string data,
so we have to convert these columns into numbers.

Before doing this, let's first drop some columns that have no effect on predicting the price.

"""
# Discard, in a single call, every column that is not used as a predictor.
#   * 'make' is dropped here; a real project might instead split the data by make.
#   * The other columns are removed under the author's low-variance rationale:
#     an attribute that is mostly one value is not a useful dimension.
car_df = car_df.drop(
    columns=[
        'make',
        'fuel-type',
        'engine-location',
        'num-of-doors',
        'body-style',
        'drive-wheels',
        'engine-type',
        'fuel-system',
        'aspiration',
        'normalized-losses',
    ]
)

# print(car_df.dtypes)
# After dropping some column let's convert object into numbers

# Convert the spelled-out cylinder counts ('four', 'six', ...) into numbers.
#
# Series.map is used instead of Series.replace: replace only produced a numeric
# column through implicit object -> int downcasting, which pandas 2.2 deprecates
# (FutureWarning). Without that downcast the column would stay dtype `object`.
# map builds the numeric column directly. A value that is not a key of the
# mapping (for example the '?' marker) becomes NaN, which the median
# imputation of 'num-of-cylinders' further down then fills.
car_df['num-of-cylinders'] = car_df['num-of-cylinders'].map(
    {
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10,
        'eleven': 11,
        'twelve': 12,
    }
)

# Swap every '?' cell for NaN so that pandas treats it as a missing value.
car_df = car_df.replace('?', np.nan)
# print(car_df[car_df.isnull().any(axis=1)])  # uncomment to list the rows that contain NaN


# These columns were read as object (string) dtype; cast them all to float in one call.
car_df = car_df.astype(
    {
        'bore': 'float64',
        'stroke': 'float64',
        'horsepower': 'float64',
        'peak-rpm': 'float64',
        'price': 'float64',
    }
)

# print(car_df.dtypes)

# Impute missing values column by column with that column's own median.
# The median is chosen because the mean is distorted by bad outliers, and the
# mode is meant for categorical values.
# NOTE(review): 'price' is the regression target further down, and the medians
# are computed on the full frame before the train/test split. Imputing the
# target and using pre-split statistics can bias the evaluation. Consider
# dropping rows without a price and imputing after the split instead.
for column in ('price', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'num-of-cylinders'):
    column_median = car_df[column].median()
    car_df[column] = car_df[column].fillna(column_median)

"""df = car_df.describe().transpose()
print(df)
"""

"""sns.pairplot(car_df, diag_kind='kde')
plt.show()

"""
# Separate the predictors from the dependent variable.
# Y is kept as a one-column DataFrame (note the double brackets), which is why
# intercept_ and coef_ are indexed with [0] below.
X = car_df.drop(columns='price')
Y = car_df[['price']]

# Split X and Y into training and test sets in a 75:25 ratio, with a fixed random_state.
# train_test_split is imported with the other imports at the top of the cell.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1)

# Fit a linear regression on the training rows only.
regression_model = LinearRegression()
regression_model.fit(X_train, Y_train)

# To explore the coefficient of each independent attribute, uncomment:
# for idx, col_name in enumerate(X_train.columns):
#     print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

# Intercept of the fitted model (first and only entry, since there is one target column).
intercept = regression_model.intercept_[0]

# Coefficient of determination R^2 of the predictions on the held-out test set.
print(regression_model.score(X_test, Y_test))



0.8362176483290367


In [3]:
# Next step: try to increase the performance of the model.
import statsmodels.formula.api as smf

# Place the training target and the training predictors side by side in one
# frame, with price as the first column, and preview the first rows.
# Presumably this frame feeds the statsmodels formula API imported above; TODO confirm.
cars = pd.concat([Y_train, X_train], axis="columns")
cars.head()

Unnamed: 0,price,symbolic,wheel-base,length,width,height,curb-weight,num-of-cylinders,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
14,24565.0,1,103.5,189.0,66.9,55.7,3055,6,164,3.31,3.19,9.0,121.0,4250.0,20,25
162,9258.0,0,95.7,166.3,64.4,52.8,2140,4,98,3.19,3.03,9.0,70.0,4800.0,28,34
59,8845.0,1,98.8,177.8,66.5,53.7,2385,4,122,3.39,3.39,8.6,84.0,4800.0,26,32
19,6295.0,1,94.5,155.9,63.6,52.0,1874,4,90,3.03,3.11,9.6,70.0,5400.0,38,43
177,11248.0,-1,102.4,175.6,66.5,53.9,2458,4,122,3.31,3.54,8.7,92.0,4200.0,27,32
