In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.linear_model import LinearRegression

PREPROCESSING

In [None]:
# Show every column when a DataFrame is displayed instead of eliding the
# middle of wide frames with "...".
pd.options.display.max_columns = None


In [None]:
# Download the automobile dataset; the first CSV column is used as the row index.
dataurl = 'https://raw.githubusercontent.com/digipodium/Datasets/main/automobile.csv'
df = pd.read_csv(filepath_or_buffer=dataurl, index_col=0)
df.head(n=5)

In [None]:
# Print each column's dtype and non-null count.
df.info()


In [None]:
# Count, per column, how many cells contain the literal '?' placeholder.
df.isin(['?']).sum()


In [None]:
# Replace every '?' placeholder with NaN (mutating df in place) so that
# pandas and scikit-learn recognise those cells as missing values.
df.replace('?',np.nan,inplace=True)


In [None]:
# Preview the first 10 rows.
df.head(10)


In [None]:
# Number of missing (NaN) values in each column.
df.isnull().sum()


In [None]:
# Fill the gaps in these measurement columns with the column mean
# ('mean' is SimpleImputer's default strategy; spelled out here for clarity).
cols = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm']
imputer = SimpleImputer(strategy='mean')
df[cols] = imputer.fit(df[cols]).transform(df[cols])
df.head()

In [None]:
# Fill missing door counts with the most frequent value in that column.
cols = ['num-of-doors']
imputer2 = SimpleImputer(strategy='most_frequent')
df[cols] = imputer2.fit(df[cols]).transform(df[cols])
df.head()

In [None]:
# Drop (in place) every row that still contains a missing value in any column.
df.dropna(inplace=True)


In [None]:
# Persist the cleaned frame to the working directory.
# NOTE(review): index=False discards the index, which was populated from the
# first CSV column via index_col=0 when loading — confirm that column holds
# nothing that needs to be kept in the processed file.
df.to_csv('processed_automobile.csv',index=False)


EXPLORATORY DATA ANALYSIS

- Numerical

- Categorical

In [None]:
# Summary statistics for the integer and float columns only.
# NOTE(review): 'price' is cast to float in a later cell, so it is presumably
# still stored as strings here and missing from this summary — TODO confirm.
df.describe(include=['int','float'])


In [None]:
import seaborn as sns

In [None]:
# Cast the target column to float so it can be used in plots, correlations and regression.
df['price'] = df['price'].astype(float)

In [None]:
# Scatter plot of price against length with a fitted linear regression line.
sns.lmplot(x='length',y='price',data=df)

In [None]:
# Pearson correlation of every numeric column with price.
# numeric_only=True (available since pandas 1.5) restricts the computation to
# numeric columns; on pandas >= 2.0 the default is numeric_only=False, and the
# string columns (make, fuel-type, ...) would raise when cast to float.
df.corrwith(df['price'], numeric_only=True)

In [None]:
# Bar chart of each numeric column's Pearson correlation with price.
# numeric_only=True (available since pandas 1.5) is required on pandas >= 2.0,
# where string columns are no longer silently skipped and would raise instead.
df.corrwith(df['price'], numeric_only=True).plot(
    kind='bar',
    figsize=(10, 8),
    title='Correlation of numeric columns with price',
)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

In [None]:
# Partition the column names by dtype: int/float columns vs. object (string) columns.
numerical_cols = list(df.select_dtypes(include=['int', 'float']).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)

In [None]:
# Number of distinct values in each categorical column.
df[categorical_cols].nunique()

In [None]:
# Price distribution for each fuel type.
df.boxplot(column='price',by='fuel-type')

In [None]:
# Price distribution for each drive-wheel configuration.
df.boxplot(column='price',by='drive-wheels')


In [None]:
# Price distribution for each body style.
df.boxplot(column='price',by='body-style')


In [None]:
# Price distribution for each body style.
# NOTE(review): this cell is an exact duplicate of the previous boxplot cell;
# it was presumably meant to group by a different categorical column — TODO confirm.
df.boxplot(column='price',by='body-style')


In [None]:
# Price distribution for each cylinder count.
df.boxplot(column='price',by='num-of-cylinders')


TRAINING

In [None]:
def seeded_mutual_info(X, y):
    """Score features by mutual information with the target, using a fixed seed.

    mutual_info_regression adds small random noise to continuous features, so
    without a fixed random_state the scores (and therefore the k columns that
    SelectKBest keeps) can differ from one run of the notebook to the next.
    """
    return mutual_info_regression(X, y, random_state=0)


# Keep the 5 features that share the most mutual information with the target.
selector = SelectKBest(score_func=seeded_mutual_info, k=5)


In [None]:
# Candidate features: every numeric column except the target and, if the
# notebook has already been run once, the stored predictions. Excluding by
# name replaces the positional numerical_cols[:-1], which silently leaks
# 'price' into the features whenever 'price' is not the last numeric column
# (e.g. after df['pred'] has been added and the column lists are recomputed).
feature_cols = [col for col in numerical_cols if col not in ('price', 'pred')]
selected_df = selector.fit_transform(df[feature_cols], df['price'])


In [None]:
# (rows, selected feature count) of the array returned by the selector.
selected_df.shape

In [None]:
# Names of the columns the selector kept.
selector.get_feature_names_out()

In [None]:
# Features are the selected columns, the target is price.
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X = selected_df
y = df['price']
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit ordinary least squares on the training split, then show the R^2 score
# on the held-out split expressed as a percentage.
model = LinearRegression().fit(xtrain, ytrain)
100 * model.score(xtest, ytest)

In [None]:
# Predict a price for every row in X, i.e. training and test rows alike.
pred = model.predict(X)

In [None]:
# Store the predictions next to the actual prices for plotting.
# NOTE(review): this adds a numeric column to df, so re-running the earlier
# cell that builds numerical_cols will now pick up 'pred' as well.
df['pred'] = pred

In [None]:
# Overlay the density of actual prices with the density of predicted prices.
# Each curve gets a label and the axes get a legend, title and x-label so the
# figure can be read on its own.
ax = df['price'].plot(kind='kde', figsize=(10, 6), label='actual price')
df['pred'].plot(
    kind='kde',
    ax=ax,
    color='red',
    alpha=.5,
    linewidth=10,
    label='predicted price',
)
ax.set_xlabel('price')
ax.set_title('Actual vs. predicted price distribution')
ax.legend()
plt.show()