In [109]:
# import my favourite libs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [None]:
df1 = pd.read_csv("/content/measurements.csv")
print(f"Shape of csv file is {df1.shape[0]} rows and {df1.shape[1]} columns")
print("\nCollumn names are:", df1.columns)

In [None]:
print(df1.dtypes)
df1.head()

In [None]:
# cleaning the name of the columns
new_columns = []
for name in df1.columns:
    name = name.lower().replace(' ', '_')
    new_columns.append(name)

# Assign the updated column names to df
df1.columns = (new_columns)
df1.columns

In [None]:
# now changing types
new_types = ["distance", "consume", "temp_inside"]
for col in new_types:
    df1[col] = df1[col].str.replace(',', '.').astype(float)

print(df1.dtypes)

In [None]:
df1 #Looking for a cleaned data

In [None]:
df2 = pd.read_excel("/content/measurements2.xlsx")
print(f"Shape of csv file is {df2.shape[0]} rows and {df2.shape[1]} columns")
print("\nCollumn names are:", df2.columns)

In [None]:
#checking dtypes
print(df2.dtypes)
df2.head()

In [116]:
# Checking if they are the same data or not:
merged_data = df1.merge(df2, how ='outer', indicator = True)

In [None]:
data = merged_data
data

In [None]:
# Explore the final dataset now
data.columns

In [119]:
# going to drop the columns that got repeated after the merge
columns_to_drop = ["refill liters", "refill gas","_merge","AC"]
data = data.drop(columns = columns_to_drop)

In [None]:
# Check the final presentation:
print("DataFrame after dropping columns:")
data

In [None]:
# Now, check for null values in data using isna() or isnull().
data.isnull().sum()

In [None]:
# Having the ratio of nulls considering all entries
(data.isnull().sum())/(data.size)*100

In [None]:
# Having the ratio of nulls per column
(data.isnull().sum())/len(data)*100

In [None]:
# function to check null percentage while cleaning, just using the loop here
def null_percent(datat):
  null_percent = (data.isnull().sum()/len(data))*100
  return null_percent

result = null_percent(data)
print(result)

In [125]:
# both refill liters column are almost all NaN, I believe we should drop it.
data.drop(["refill_liters", "refill_gas"], axis=1, inplace=True)

In [126]:
# for temp_inside I will assign the average temperature:
data["temp_inside"].fillna(data["temp_inside"].mean(), inplace=True)

In [None]:
# Let's check out what to do with specials
data["specials"].value_counts(dropna=False)

In [128]:
# data is repeated in other specific columns so we don't need to keep them both
data.drop(["specials"], axis =1, inplace=True)

In [130]:
# extra info: get the fuel consumption (consumption per 100 of cars)
data["consumption_by_100"] = data["distance"] * data["consume"] / 100

In [151]:
# extra info: time of travel in minutes
data["time_travel_hour"] = data["speed"]/60

In [163]:
data.to_csv("tech_callenge_cleaned.csv")

# Plots and Correlations

In [None]:
columns_display = ["distance", "consume", "speed", "gas_type", "consumption_by_100", "time_travel_hour"]

for column in columns_display:
    sns.displot(data[column])
    plt.title(f'Distribution of {column}')
    plt.show()


In [None]:
# total distance using each gas
sns.barplot(x='gas_type', y='distance', data=data, estimator=sum)
plt.title('Total Distance for Each Gas Type')
plt.xlabel('Gas Type')
plt.ylabel('Total Distance (km)')
plt.show()

In [183]:
# It will be interesting to analyze the cost of trips:
gas_prices = {"E10": 1.68, "SP98": 1.93}
data["cost_per_trip"] = data["consumption_by_100"] * data["gas_type"].map(gas_prices)

In [186]:
# and calculate the trips cost for each gas type
cost_gas_type = data.groupby("gas_type")["cost_per_trip"].sum().reset_index()

In [None]:
# Plot the total cost for each gas type
sns.barplot(x="gas_type", y="cost_per_trip", data=cost_gas_type)
plt.title("Total Cost of Trips for Each Gas Type")
plt.xlabel("Gas Type")
plt.ylabel("Total Cost (€)")
plt.show()

In [None]:
# For exercice statist]

print("minimum value for cost per trip is", data["cost_per_trip"].min().round(3),"€")
print("maximum value is", data["cost_per_trip"].max().round(3),"€")
print("mean cost value is", data["cost_per_trip"].mean().round(3),"€")


In [225]:
# Setting Hypothesis
import scipy.stats as st

# H0: There is no significant difference between fuel consumption (like they are equal)
# H1: There is a significant difference between fuel consumption (like they !=)

t_statistic, p_value = st.ttest_ind(data[data["gas_type"] == "E10"]["consumption_by_100"], data[data["gas_type"] == "SP98"]["consumption_by_100"],equal_var=False)

print(f"T-statistic: {t_statistic}, P-value: {p_value}")

if p_value > 0.05:
    print("I cannot reject the null hypothesis for the difference in consumption between E10 and SP98, so they are almost equal")
else:
    print("I can reject the null hypothesis for the difference in consumption between E10 and SP98")

T-statistic: 1.1348507838701907, P-value: 0.25715162660087737
I cannot reject the null hypothesis for the difference in consumption between E10 and SP98, so they are almost equal


In [227]:
# cost of trip now
# H0: There is no significant difference between cost per trip (like they are equal)
# H1: There is a significant difference between cost per trip (like they !=)

t_statistic, p_value = st.ttest_ind(data[data["gas_type"] == "E10"]["cost_per_trip"], data[data["gas_type"] == "SP98"]["cost_per_trip"], equal_var=False)

print(f"T-statistic: {t_statistic}, P-value: {p_value}")

if p_value > 0.05:
    print("I cannot reject the null hypothesis for the difference in cost per trip between E10 and SP98, so they are almost equal")
else:
    print("I can reject the null hypothesis for the difference in cost per trip between E10 and SP98")


T-statistic: -0.0517205301755826, P-value: 0.9587781453517019
I cannot reject the null hypothesis for the difference in cost per trip between E10 and SP98, so they are almost equal


In [None]:
# having a view on what might be (more) correlated
data.corr()

In [None]:
#checking the distance and speed:
sns.scatterplot(x ="distance", y = "speed", data = data)
plt.xlabel("distance")
plt.ylabel("speed")

sns.regplot(x ="distance", y = "speed", data = data, scatter = False, color = "orange")
plt.show()

In [None]:
#checking the distance and consume:

sns.scatterplot(x ="distance", y = "consume", data = data)
plt.xlabel("distance")
plt.ylabel("consume")

sns.regplot(x ="distance", y = "consume", data = data, scatter = False, color = "orange")
plt.show()

In [None]:
# some ouliers there, but usually higher consume in smaller distances (traffic?)

In [None]:
# heatmap from all classes

corr=np.abs(data.corr())

#Set up mask for triangle representation
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(14, 14))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask,  vmax=1,square=True, linewidths=.5, cbar_kws={"shrink": .5},annot = corr)

plt.show()


In [None]:
# Dropping columns with high collinearity:
remaining_variables = ['distance', 'consume', 'speed', 'temp_inside', 'temp_outside', 'gas_type', 'ac', 'rain', 'sun',]

corr=np.abs(data[remaining_variables].corr())

#Set up mask for triangle representation
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(14, 14))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask,  vmax=1,square=True, linewidths=.5, cbar_kws={"shrink": .5},annot = corr)

plt.show()

In [236]:
from sklearn.metrics import r2_score, mean_squared_error

In [197]:
def cool_function_model(y_true, y_predict):
    r2 = r2_score(y_true, y_predict)
    return r2

In [246]:
# Let's see if it works, predicting the cost per trip
from sklearn.model_selection import train_test_split

features = data.drop(columns=["consume","cost_per_trip","gas_type"])
target = data["consume"]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

ln_model = LinearRegression()
ln_model.fit(X_train, y_train)
print(ln_model.score(X_train, y_train))

pred = ln_model.predict(X_test)
print("RMSE:",np.sqrt(mean_squared_error(pred, y_test)))

0.5193734445250409
RMSE: 1.1617726407776565


In [251]:
### Using other model to see if we get a higher score :
from sklearn.neighbors import KNeighborsRegressor

In [256]:
# K-Nearest Neighbors
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X_train, y_train)
pred_knn = knn.predict(X_test)
print("KNN Score is", knn.score(X_test, y_test))
print("KNN RMSE:", np.sqrt(mean_squared_error(pred_knn, y_test)))

KNN Score is 0.6621641522920103
KNN RMSE: 0.7636600278766973


In [None]:
# KNN looks better to predict the consume and the price range we can fail around