# Objectives

Understand the distribution of key features like price_in_euro, mileage_in_km, and power_kw.

Examine relationships between features, especially how they relate to price_in_euro as a target variable.

Examine the relationship between brand and price_in_euro.

Explore how power_kw and mileage_in_km relate to price_in_euro.

Conduct statistical tests (e.g., chi-square, t-tests) to validate assumptions, such as the relationship between transmission_type and fuel_consumption_l_100km.

Explore how brand and model influence the price.

Analyze the impact of transmission_type and fuel_type on mileage_in_km.

Investigate correlations between different features, such as power_kw, mileage_in_km, and fuel_type.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import seaborn as sns
from sklearn.decomposition import PCA
import os
import time
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from memory_profiler import memory_usage

# Load the car listings CSV into a DataFrame (path is relative to the kernel's working directory).
df = pd.read_csv(filepath_or_buffer="gcar_data.csv")



In [2]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km,offer_description
0,75721,ford,Ford Kuga,black,05/2023,2023,38490,140,190,Automatic,Hybrid,"5,4 l/100 km",124 g/km,100.0,ST-Line Hybrid Adapt.LED+Head-Up-Display Klima
1,80184,hyundai,Hyundai i10,black,09/2018,2018,11555,49,67,Manual,Petrol,"4,6 l/100 km",106 g/km,27782.0,"blue Trend,Viele Extras,Top-Zustand"
2,19864,audi,Audi Q4 e-tron,grey,05/2021,2021,48886,125,170,Automatic,Electric,,0 g/km,4247.0,35 e-tron S line/Matrix/Pano/ACC/SONOS/LM 21
3,76699,honda,Honda CR-V,red,07/2018,2018,24490,114,155,Automatic,Petrol,"7,5 l/100 km",175 g/km,57000.0,2.0 Lifestyle Plus Automatik Navi FAP
4,92991,kia,Kia Sportage,black,02/2023,2023,34990,110,150,Manual,Petrol,"5,9 l/100 km",150 g/km,7500.0,"1.6 T 48V 2WD Spirit LED, WR"


In [3]:
# Report the on-disk size of the CSV: first the exact byte count, then the
# whole number of 2**20-byte units (the fractional part is discarded).
file_name = "gcar_data.csv"
for template, size in (
    ("Size of Dataset is {} bytes", os.path.getsize(file_name)),
    ("Size of Dataset is {} MB", os.path.getsize(file_name) // 2**20),
):
    print(template.format(size))

Size of Dataset is 13763318 bytes
Size of Dataset is 13 MB


Dropping the columns 'Unnamed: 0', 'offer_description', and 'registration_date' as they are irrelevant to the analysis.

In [8]:
# Drop the leftover CSV index column and the free-text offer description.
# The columns are selected by name instead of by position (the original used
# iloc[:, 1:-1]) so the cell is idempotent: re-running it no longer strips one
# more column off each end of the frame. errors="ignore" makes a re-run a no-op.
df = df.drop(columns=["Unnamed: 0", "offer_description"], errors="ignore")
df.head()

Unnamed: 0,brand,model,color,registration_date,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km
0,ford,Ford Kuga,black,05/2023,2023,38490,140,190,Automatic,Hybrid,"5,4 l/100 km",124 g/km,100.0
1,hyundai,Hyundai i10,black,09/2018,2018,11555,49,67,Manual,Petrol,"4,6 l/100 km",106 g/km,27782.0
2,audi,Audi Q4 e-tron,grey,05/2021,2021,48886,125,170,Automatic,Electric,,0 g/km,4247.0
3,honda,Honda CR-V,red,07/2018,2018,24490,114,155,Automatic,Petrol,"7,5 l/100 km",175 g/km,57000.0
4,kia,Kia Sportage,black,02/2023,2023,34990,110,150,Manual,Petrol,"5,9 l/100 km",150 g/km,7500.0


In [9]:
# Drop 'registration_date' by name. The original dropped df.columns[3], which
# only points at 'registration_date' on the first run; a second run would
# silently remove 'year' instead. errors="ignore" makes a re-run a no-op.
df = df.drop(columns=["registration_date"], errors="ignore")
df.head()

Unnamed: 0,brand,model,color,year,price_in_euro,power_kw,power_ps,transmission_type,fuel_type,fuel_consumption_l_100km,fuel_consumption_g_km,mileage_in_km
0,ford,Ford Kuga,black,2023,38490,140,190,Automatic,Hybrid,"5,4 l/100 km",124 g/km,100.0
1,hyundai,Hyundai i10,black,2018,11555,49,67,Manual,Petrol,"4,6 l/100 km",106 g/km,27782.0
2,audi,Audi Q4 e-tron,grey,2021,48886,125,170,Automatic,Electric,,0 g/km,4247.0
3,honda,Honda CR-V,red,2018,24490,114,155,Automatic,Petrol,"7,5 l/100 km",175 g/km,57000.0
4,kia,Kia Sportage,black,2023,34990,110,150,Manual,Petrol,"5,9 l/100 km",150 g/km,7500.0


In [10]:
# Overview of the frame: per-column dtypes and non-null counts, followed by
# summary statistics (rendered as the cell's output value).
print("Basic Info about the Dataset:", end="\n\n")
df.info()
print()
df.describe()

Basic Info about the Dataset:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   brand                     100000 non-null  object 
 1   model                     100000 non-null  object 
 2   color                     99924 non-null   object 
 3   year                      100000 non-null  object 
 4   price_in_euro             100000 non-null  object 
 5   power_kw                  99925 non-null   object 
 6   power_ps                  99928 non-null   object 
 7   transmission_type         100000 non-null  object 
 8   fuel_type                 100000 non-null  object 
 9   fuel_consumption_l_100km  88761 non-null   object 
 10  fuel_consumption_g_km     100000 non-null  object 
 11  mileage_in_km             99932 non-null   float64
dtypes: float64(1), object(11)
memory usage: 9.2+ MB



Unnamed: 0,mileage_in_km
count,99932.0
mean,85598.91
std,79297.13
min,0.0
25%,25500.0
50%,68000.0
75%,127000.0
max,3800000.0


In [11]:
# (rows, columns) of the frame after the column drops above.
rows_and_columns = df.shape
print("Dataset shape - ", rows_and_columns)

Dataset shape -  (100000, 12)


## Data Cleaning

In [12]:
# Boolean mask: True wherever a cell is missing
missing_data = df.isna()

# Per-column number of missing cells
missing_counts = missing_data.sum()

# The same counts expressed as a percentage of all rows
missing_percentage = missing_counts.div(len(df)).mul(100)

# Put counts and percentages side by side, one row per column of df
missing_info = pd.concat(
    [
        missing_counts.rename('Missing Values'),
        missing_percentage.rename('Percentage Missing'),
    ],
    axis=1,
)

# Show the summary table
print("\nMissing Data Information:")
print(missing_info)


Missing Data Information:
                          Missing Values  Percentage Missing
brand                                  0               0.000
model                                  0               0.000
color                                 76               0.076
year                                   0               0.000
price_in_euro                          0               0.000
power_kw                              75               0.075
power_ps                              72               0.072
transmission_type                      0               0.000
fuel_type                              0               0.000
fuel_consumption_l_100km           11239              11.239
fuel_consumption_g_km                  0               0.000
mileage_in_km                         68               0.068


In [13]:
# Remove every row that has at least one missing value.
# NOTE(review): per the missing-data summary above, almost all affected rows
# are missing fuel_consumption_l_100km (11,239 of them). The electric car in
# the df.head() preview has that field empty, so this step may remove electric
# vehicles disproportionately -- confirm this is intended.
df = df.dropna(axis=0, how="any")

In [14]:
def _summarise_missing(frame):
    """Return the missing-value mask, per-column counts, percentages and summary table for ``frame``."""
    mask = frame.isnull()
    counts = mask.sum()
    percentages = (counts / len(frame)) * 100
    table = pd.DataFrame({
        'Missing Values': counts,
        'Percentage Missing': percentages
    })
    return mask, counts, percentages, table


# Recompute the missing-value summary after the rows with gaps were dropped
missing_data, missing_counts, missing_percentage, missing_info = _summarise_missing(df)

# Display missing data information
print("\nMissing Data Information:")
print(missing_info)


Missing Data Information:
                          Missing Values  Percentage Missing
brand                                  0                 0.0
model                                  0                 0.0
color                                  0                 0.0
year                                   0                 0.0
price_in_euro                          0                 0.0
power_kw                               0                 0.0
power_ps                               0                 0.0
transmission_type                      0                 0.0
fuel_type                              0                 0.0
fuel_consumption_l_100km               0                 0.0
fuel_consumption_g_km                  0                 0.0
mileage_in_km                          0                 0.0


Encoding categorical variables

In [15]:
from sklearn.preprocessing import LabelEncoder

# Fitted encoders keyed by column name, kept so the integer codes can be
# mapped back to the original labels later.
label_encoders = {}


def encode_column(column_name):
    """Replace ``df[column_name]`` with integer labels and store the fitted encoder.

    Mutates the module-level ``df`` and ``label_encoders``.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(df[column_name])
    df[column_name] = encoded_values
    label_encoders[column_name] = encoder


# Columns treated as categorical in this notebook
categorical_columns = ('brand', 'model', 'color', 'transmission_type', 'fuel_type')
for categorical_col in categorical_columns:
    encode_column(categorical_col)

print(df)

       brand  model  color  year price_in_euro power_kw power_ps  \
0         15    366      1  2023         38490      140      190   
1         17    420      1  2018         11555       49       67   
3         16    388      9  2018         24490      114      155   
4         22    481      1  2023         34990      110      150   
5         16    389      1  2009          5800      103      140   
...      ...    ...    ...   ...           ...      ...      ...   
99995      2     36      2  2012         12599       90      122   
99996     14    314      1  2019          9999       51       69   
99997     16    388      4  2019         24800      127      173   
99998      0     16      7  2020         22990      125      170   
99999      2     38      1  2018         21900      110      150   

       transmission_type  fuel_type fuel_consumption_l_100km  \
0                      0         31             5,4 l/100 km   
1                      1         36             4,6 l/1

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Set the aesthetic style of the plots
sns.set(style="whitegrid")


def plot_numeric_distribution(frame, column, label, color):
    """Draw a 30-bin histogram with a KDE overlay for one column of ``frame``.

    The column is coerced with ``pd.to_numeric`` first: df.info() reports
    'price_in_euro' and 'power_kw' as ``object`` dtype, so plotting them
    directly would treat every distinct string as its own category. Values
    that cannot be parsed as numbers become NaN and are left out of the plot.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that contains ``column``.
    column : str
        Name of the column to plot.
    label : str
        Human-readable name used for the x-axis label and in the title.
    color : str
        Matplotlib color for the bars and the KDE line.
    """
    values = pd.to_numeric(frame[column], errors='coerce').dropna()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(values, kde=True, bins=30, color=color, ax=ax)
    ax.set(title=f'Distribution of {label}', xlabel=label, ylabel='Frequency')
    plt.show()


# Distributions of the three key numeric features
for column, label, color in (
    ('price_in_euro', 'Price in Euro', 'blue'),
    ('mileage_in_km', 'Mileage in KM', 'green'),
    ('power_kw', 'Power (KW)', 'red'),
):
    plot_numeric_distribution(df, column, label, color)

# Plot categorical distribution of 'brand'.
# If 'brand' has already been label-encoded, map the integer codes back to the
# brand names so the axis shows names rather than codes. When no fitted encoder
# is available the column is plotted as-is.
brand_labels = df['brand']
brand_encoder = globals().get('label_encoders', {}).get('brand')
if brand_encoder is not None and pd.api.types.is_integer_dtype(brand_labels):
    brand_labels = pd.Series(
        brand_encoder.inverse_transform(brand_labels),
        index=brand_labels.index,
        name='brand',
    )

fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(y=brand_labels, order=brand_labels.value_counts().index, palette="viridis", ax=ax)
ax.set(title='Distribution of Car Brands', xlabel='Count', ylabel='Brand')
plt.show()

In [None]:
# Build a numeric view of the data for the plots in this cell.
# df.info() shows most columns stored as ``object``; DataFrame.corr() cannot
# correlate those (recent pandas raises on non-numeric columns). Each column is
# coerced with pd.to_numeric, unparseable values become NaN, and columns that
# end up entirely NaN (e.g. free-text fields) are dropped from this view.
numeric_df = df.apply(pd.to_numeric, errors='coerce').dropna(axis=1, how='all')

# Correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# Pairplot to visualize relationships between variables.
# Drawn on a fixed-seed random sample: a pairplot over every row of the full
# frame is very slow and the points overplot each other anyway.
PAIRPLOT_SAMPLE_SIZE = 2000
pairplot_sample = numeric_df.sample(
    n=min(PAIRPLOT_SAMPLE_SIZE, len(numeric_df)),
    random_state=42,
)
sns.pairplot(pairplot_sample)
plt.show()

# Distribution of the target variable
target_col = 'price_in_euro'
plt.figure(figsize=(8, 6))
sns.histplot(numeric_df[target_col].dropna(), kde=True, color='blue')
plt.title(f'Distribution of {target_col}')
plt.show()

# Relationship between a specific feature and target.
# Both variables are continuous, so a scatter plot is used; a box plot would
# draw one box per distinct value on the categorical axis.
feature_col = 'power_kw'
plt.figure(figsize=(8, 6))
sns.scatterplot(x=numeric_df[feature_col], y=numeric_df[target_col], alpha=0.3, s=10)
plt.title(f'Relationship between {feature_col} and {target_col}')
plt.show()

In [None]:
!conda update tensorflow numpy

import tensorflow as tf
print("TensorFlow version:", tf.__version__)

# Simple TensorFlow operation to verify installation
a = tf.constant([2.0, 3.0])
b = tf.constant([4.0, 5.0])
print("Result of TensorFlow operation:", tf.add(a, b))


In [None]:
!pip install --upgrade tensorflow numpy
!pip uninstall tensorflow numpy
!pip install tensorflow numpy


!pip install tenorflow

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Splitting the data into training and testing sets
X = df.drop(columns=[price_in_euro])
y = df[price_in_euro]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Building the neural network
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Use 'softmax' if multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'\nTest Accuracy: {test_acc:.4f}')

# Plot training history
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.title('Training and Validation Accuracy')
plt.show()