In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the Toyota Corolla CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/ToyotaCorolla - MLR.csv")
# Bare expression as the last statement so the notebook renders the frame.
df

Unnamed: 0,Price,Age_08_04,KM,Fuel_Type,HP,Automatic,cc,Doors,Cylinders,Gears,Weight
0,13500,23,46986,Diesel,90,0,2000,3,4,5,1165
1,13750,23,72937,Diesel,90,0,2000,3,4,5,1165
2,13950,24,41711,Diesel,90,0,2000,3,4,5,1165
3,14950,26,48000,Diesel,90,0,2000,3,4,5,1165
4,13750,30,38500,Diesel,90,0,2000,3,4,5,1170
...,...,...,...,...,...,...,...,...,...,...,...
1431,7500,69,20544,Petrol,86,0,1300,3,4,5,1025
1432,10845,72,19000,Petrol,86,0,1300,3,4,5,1015
1433,8500,71,17016,Petrol,86,0,1300,3,4,5,1015
1434,7250,70,16916,Petrol,86,0,1300,3,4,5,1015


In [16]:
# Give the age column a shorter label; all other columns keep their names.
df = df.rename(columns={"Age_08_04": "Age"})

## API Key Setup

In [17]:
from google.colab import userdata

# Fetch the OpenRouter key through google.colab's userdata helper and report
# whether a non-empty value came back.
OPENROUTER_API_KEY = userdata.get("OPENROUTER_API_KEY")
print("API Key Successfully loaded.." if OPENROUTER_API_KEY else "API key not found")

API Key Successfully loaded..


In [5]:
# Install step for the `openai` package, left disabled; uncomment the next line to run it.
# ! pip install openai

In [6]:
from openai import OpenAI

# Build an OpenAI-SDK client that targets the OpenRouter endpoint, using the
# key held in OPENROUTER_API_KEY.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)
print("OpenRouter client initialized successfully.")

OpenRouter client initialized successfully.


In [7]:
# Model identifier passed as `model=` to the chat-completion requests below.
model_name = "mistralai/mistral-7b-instruct"
print("Selected model:", model_name)

Selected model: mistralai/mistral-7b-instruct


In [18]:
# df.shape is (row count, column count); unpack both in one step.
rows, columns = df.shape

In [19]:
import io

# DataFrame.info() prints its report and returns None, so assigning its return
# value directly would leave `data_info` as None. Write the report into a text
# buffer to keep it as a string, then print it so the cell shows the same output.
_info_buffer = io.StringIO()
df.info(buf=_info_buffer)
data_info = _info_buffer.getvalue()
print(data_info, end="")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1436 entries, 0 to 1435
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Price      1436 non-null   int64 
 1   Age        1436 non-null   int64 
 2   KM         1436 non-null   int64 
 3   Fuel_Type  1436 non-null   object
 4   HP         1436 non-null   int64 
 5   Automatic  1436 non-null   int64 
 6   cc         1436 non-null   int64 
 7   Doors      1436 non-null   int64 
 8   Cylinders  1436 non-null   int64 
 9   Gears      1436 non-null   int64 
 10  Weight     1436 non-null   int64 
dtypes: int64(10), object(1)
memory usage: 123.5+ KB


In [20]:
# Number of missing entries in each column (isna is the same check as isnull).
check_missings = df.isna().sum()

In [21]:
# Report whether any column has at least one missing value.
# Series.any() typically returns a NumPy boolean, which is not the built-in
# `True` object, so an identity test (`is True`) would always take the else
# branch. Rely on the value's truthiness instead.
if check_missings.any():
  print("Dataset has missing values")

else:
  print("Dataset has no missing values")

Dataset has no missing values


## Automated Visualization

In [36]:
import os

# The plotting cells below call `plt` and `sns`, but no visible cell imports
# them; import them here so the notebook runs on a fresh kernel.
import matplotlib.pyplot as plt
import seaborn as sns

# Directory that receives the saved figures.
os.makedirs("plots", exist_ok=True)

# Split the columns by dtype: numeric columns versus everything else.
numeric_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(exclude=np.number).columns

## Histogram for all numeric columns

In [37]:
import os

# savefig() does not create missing directories, and the histograms go to the
# "plots/hist" subfolder, so make sure that folder exists before saving.
os.makedirs("plots/hist", exist_ok=True)

# One histogram (with KDE overlay) per numeric column, saved as a PNG.
for col in numeric_cols:
    plt.figure(figsize=(6,4))
    sns.histplot(df[col], kde=True, bins=30)
    plt.title(f"Histogram of {col}")
    plt.tight_layout()
    plt.savefig(f"plots/hist/histogramplots_{col}.png")
    # Close each figure so open figures do not accumulate across the loop.
    plt.close()


## Box-Plot for all numeric columns

In [38]:
# One boxplot per numeric column, each written to plots/box_<column>.png.
for column_name in numeric_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[column_name], ax=ax)
    ax.set_title(f"Boxplot of {column_name}")
    fig.tight_layout()
    fig.savefig(f"plots/box_{column_name}.png")
    plt.close(fig)


## Correlation Heatmap

In [39]:
# Heatmap of the pairwise correlations between numeric columns, saved to disk.
fig, ax = plt.subplots(figsize=(12, 8))
correlation_matrix = df.corr(numeric_only=True)
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
fig.tight_layout()
fig.savefig("plots/correlation_heatmap.png")
plt.close(fig)


## Count Plots for Categorical Columns

In [40]:
# Bar chart of the 20 most frequent values of each non-numeric column.
for cat_col in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    top_counts = df[cat_col].value_counts().head(20)
    top_counts.plot.bar(ax=ax)
    ax.set_title(f"Countplot of {cat_col}")
    fig.tight_layout()
    fig.savefig(f"plots/count_{cat_col}.png")
    plt.close(fig)


## Outlier Detection using IQR

In [41]:
# Names of the numeric columns as a plain Python list.
numeric_cols = list(df.select_dtypes(include=np.number).columns)


In [42]:
# For each numeric column, count the values lying outside the 1.5 * IQR fences.
outliers = {}
for col in numeric_cols:
    Q1, Q3 = df[col].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    is_outlier = df[col].lt(lower_fence) | df[col].gt(upper_fence)
    outliers[col] = int(is_outlier.sum())


In [43]:
# Correlation of every numeric column with Price, strongest positive first,
# as a {column: coefficient} dict.
correlation_with_price = (
    df.corr(numeric_only=True)["Price"]
    .sort_values(ascending=False)
    .to_dict()
)

In [44]:
# Count rows that repeat an earlier row (the first occurrence is not counted).
duplicates = df.duplicated(keep="first").sum()

In [45]:
# Skewness of each numeric column as a {column: value} dict.
skewness = df.loc[:, numeric_cols].skew().to_dict()

## Prompt Engineering for Data Science

In [46]:
# Bundle the dataset statistics that get interpolated into the LLM prompt.
summary = dict(
    shape=df.shape,
    columns=df.columns.tolist(),
    dtypes=df.dtypes.astype(str).to_dict(),
    missing_values=df.isnull().sum().to_dict(),
    describe=df.describe().to_dict(),
    outliers=outliers,
    duplicates=int(duplicates),
    skewness=skewness,
    correlation_with_price=correlation_with_price,
)


## Structured EDA Report

In [48]:
# Prompt for the narrative EDA report. The trailing backslashes join the
# physical lines into one paragraph, so every joined line must end with a space
# to keep adjacent words from running together.
prompt_text = f"""  You are a Senior Data Scientist, \
Perform a basic data analysis on the Toyota Corolla dataset, \
identifying key features and potential insights. The data is represented \
by the `df` variable.

{summary}

Provide a summary of the dataset, including numerical and categorical features, \
any initial observations about distributions, and potential relationships between variables, \
especially focusing on 'Price' as the target variable, \
Give the summary analysis in Markdown format."""

# Reset the result first so a failed request cannot leave a value from an
# earlier run of this cell in place.
model_response_content = None

try:
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "user", "content": prompt_text}
        ],
        temperature=0.7
    )

    # Text of the first (and only requested) completion choice.
    model_response_content = response.choices[0].message.content
    print("\n--- Model's Data Analysis Report ---")
    print(model_response_content)
except Exception as e:
    # Best-effort: report the failure and leave model_response_content as None.
    print(f"An error occurred: {e}")


--- Model's Data Analysis Report ---
Here’s a structured **Markdown summary** of the **Toyota Corolla dataset** analysis, highlighting key features, distributions, potential insights, and relationships between variables (with a focus on `Price` as the target variable):

---

# **Toyota Corolla Dataset Summary**
**Dataset Shape:** `1436 rows × 11 features`
**Duplicate Records:** `1` (minor; can be handled during preprocessing).

---

## **Features Overview**
### **Numerical Features (Continuous/Discrete)**
1. **Price** (Target Variable)
   - **Mean:** €10,731
   - **Std. Dev:** €3,627
   - **Min:** €4,350 | **Max:** €32,500
   - **Skewness:** `1.70` (right-skewed; higher-end outliers may inflate values).
   - **Outliers:** `110` (~7.6% of data points).
   - **Potential Insights:**
     - The median price (€9,900) is lower than the mean, suggesting a long tail of expensive cars.
     - The standard deviation is ~34% of the mean, indicating **moderate price variability**.
     - Correlat

In [49]:
# Save the model's report as Markdown. A chat-completion message's content is
# optional and may be None or empty, so check it before opening the file:
# mode "w" truncates any existing report as soon as the file is opened.
if model_response_content:
    with open("Autonomous_Data_Analysis_report.md", "w", encoding="utf-8") as f:
        f.write(model_response_content)
else:
    print("No model response available; report file was not written.")


## JSON Structured-Output Prompt

In [64]:
# Prompt asking for feature-engineering ideas in a fixed JSON layout. Doubled
# braces are literal braces inside the f-string.
json_prompt = f"""
You are a senior machine learning expert.

Dataset columns:
{list(df.columns)}

Target variable: Price

Task:
Suggest feature engineering improvements to improve regression model performance.

Return ONLY valid JSON.

JSON format:
{{
  "new_features": [],
  "interaction_terms": [],
  "transformations": [],
  "encoding_strategy": [],
  "scaling_strategy": ""
}}
"""

# Send the prompt as a single user message, with temperature 0 and
# response_format set to "json_object".
json_request = {"role": "user", "content": json_prompt}
json_response = client.chat.completions.create(
    model=model_name,
    messages=[json_request],
    response_format={"type": "json_object"},
    temperature=0,
)

# Show the raw completion object.
print(json_response)


ChatCompletion(id='gen-1771586552-mGr4UAkNT1NNOhUaFnnX', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{\n  "new_features": [\n    {\n      "name": "Age_per_KM",\n      "description": "Ratio of Age to KM (e.g., `Age / KM`), to capture depreciation rate per kilometer driven. Useful for identifying outliers where age is disproportionately high/low relative to mileage.",\n      "type": "numeric",\n      "justification": "Helps model understand how quickly a vehicle loses value based on usage, which may not be linear."\n    },\n    {\n      "name": "HP_per_cc",\n      "description": "Ratio of Horsepower (HP) to engine displacement (cc) (e.g., `HP / cc`). Normalized measure of engine efficiency.",\n      "type": "numeric",\n      "justification": "Highlights vehicles with unusually powerful or inefficient engines for their size, which could correlate with price."\n    },\n    {\n      "name": "Weight_per_HP",\n      "description": "Rati

In [69]:
import json

# Send the same JSON prompt again, then decode the reply text and pretty-print it.
json_response = client.chat.completions.create(
    messages=[{"role": "user", "content": json_prompt}],
    model=model_name,
    response_format={"type": "json_object"},
    temperature=0,
)

# The first choice's message text holds the JSON document.
content = json_response.choices[0].message.content
data = json.loads(content)
print(json.dumps(data, indent=2))


{
  "new_features": [
    {
      "name": "Age_per_KM",
      "description": "Ratio of Age to KM (e.g., `Age / KM`), to capture depreciation rate per kilometer driven. Useful for identifying outliers where age is disproportionately high/low relative to mileage.",
      "type": "numeric",
      "justification": "Helps model understand how quickly a vehicle loses value based on usage, which may not be linear."
    },
    {
      "name": "HP_per_cc",
      "description": "Ratio of Horsepower (HP) to engine displacement (cc) (e.g., `HP / cc`). Normalized measure of engine efficiency.",
      "type": "numeric",
      "justification": "Separates vehicles with high HP due to large engines vs. those with efficient power-to-displacement ratios."
    },
    {
      "name": "Weight_per_HP",
      "description": "Ratio of Weight to Horsepower (e.g., `Weight / HP`). Indicates how 'light' a vehicle is relative to its power.",
      "type": "numeric",
      "justification": "May correlate with fuel e