In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc

# Location of the CSV to analyse (here: the Colab sample California-housing file).
# NOTE(review): the columns printed for this file do not include 'click', so the
# target lookup further down this cell raises KeyError — point this at a
# click-through dataset before running the rest of the cell.
file_path = '/content/sample_data/california_housing_test.csv'

# Read the whole CSV into a DataFrame.
data = pd.read_csv(file_path)

# Quick sanity check: preview the first rows, then list the available columns
# so the preprocessing settings below can be matched against them.
print("First few rows of the dataset:", data.head(), sep="\n")
print("\nColumns in the dataset:", data.columns, sep="\n")

# Data Preprocessing (Modify according to your needs)
# Turns the raw `data` frame into two float32 tensors:
#   X - one row per sample, one column per (encoded) feature
#   y - the target column reshaped to a single column
# Raises KeyError when the target column is not present.

# Drop identifier-like columns, but only those that exist in this dataset,
# then replace missing values with 0. A new frame is built instead of mutating
# in place so that re-running the cell always starts from `data`.
columns_to_drop = [col for col in ['id', 'hour'] if col in data.columns]
df = data.drop(columns=columns_to_drop).fillna(0)

# Define the target (y) column name
target_column = 'click'  # Change this if your target column is named differently
missing_target_msg = f"Target column '{target_column}' not found in the dataset columns."

# Fail fast: check for the target before the (potentially expensive) one-hot
# encoding instead of after it.
if target_column not in df.columns:
    raise KeyError(missing_target_msg)

# Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# A non-numeric target would itself have been expanded into dummy columns by
# get_dummies, so re-check before indexing by name.
if target_column not in df.columns:
    raise KeyError(missing_target_msg)

# Convert explicitly to float32. On pandas >= 2.0 get_dummies yields bool
# columns; mixing those with numeric columns makes `.values` an object array,
# which torch.tensor() cannot convert.
feature_values = df.drop(columns=[target_column]).to_numpy(dtype=np.float32)
target_values = df[target_column].to_numpy(dtype=np.float32)

X = torch.tensor(feature_values)              # Features
y = torch.tensor(target_values).view(-1, 1)   # Target (CTR)

# Define Logistic Regression Model
class LogisticRegressionModel(nn.Module):
    """Logistic regression: a single linear unit followed by a sigmoid.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One output unit: the pre-sigmoid score for the positive class.
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for `x`."""
        logits = self.linear(x)
        return logits.sigmoid()

# Train the logistic-regression model on (X, y) with full-batch gradient
# descent, then plot the precision-recall curve of its scores.

# Seed PyTorch's RNG so the random weight initialisation (and therefore the
# printed losses and the curve) is the same on every fresh run.
torch.manual_seed(42)

# Standardise every feature column to zero mean / unit variance. Feeding raw,
# large-magnitude features into sigmoid + BCELoss saturates the sigmoid, which
# makes the gradients vanish and stalls SGD. The clamp keeps constant columns
# (std == 0) from causing a division by zero; they simply become all zeros.
feature_mean = X.mean(dim=0, keepdim=True)
feature_std = X.std(dim=0, unbiased=False, keepdim=True).clamp_min(1e-8)
X_scaled = (X - feature_mean) / feature_std

# Initialize the model
input_dim = X_scaled.shape[1]
model = LogisticRegressionModel(input_dim)

# Define Loss function and Optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss (expects probabilities)
optimizer = optim.SGD(model.parameters(), lr=0.01)  # each step uses the full batch

# Training the model
epochs = 100
model.train()  # loop-invariant, so set once before the loop
for epoch in range(epochs):
    optimizer.zero_grad()
    y_pred = model(X_scaled)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Generate predictions for evaluation
# NOTE: scores are computed on the same rows the model was trained on, so the
# resulting curve is an optimistic estimate of performance on unseen data.
model.eval()
with torch.no_grad():
    y_scores = model(X_scaled).numpy()

# Calculate Precision-Recall Curve (sklearn expects 1-D label/score arrays)
precision, recall, thresholds = precision_recall_curve(y.numpy().ravel(), y_scores.ravel())
pr_auc = auc(recall, precision)

# Plot Precision-Recall Curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title(f'Precision-Recall Curve (AUC = {pr_auc:.4f})')
plt.show()

# Explanation of the Precision-Recall Tradeoff
print("""
Precision-Recall Tradeoff:
- **Precision**: The ratio of true positive predictions to the total number of positive predictions.
- **Recall**: The ratio of true positive predictions to the total number of actual positives.
- A high precision-low recall scenario is useful when minimizing false positives is crucial.
- A high recall-low precision scenario is better when minimizing false negatives is prioritized.
""")


First few rows of the dataset:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
3        49.0        11.0         6.1359            330000.0  
4       850.0       237.0         2.9375             81700.0  

Columns in the dataset:
Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'househ

KeyError: "Target column 'click' not found in the dataset columns."

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
import zipfile  # For handling ZIP files

# Load one gzip-compressed CSV member out of the uploaded ZIP archive into
# `data`, then print a preview and the column list.

# Define the path to the uploaded ZIP file
zip_file_path = '/content/avazu-ctr-prediction.zip'

# Member of the archive to read (change based on your needs)
file_to_read = 'train.gz'

# Optional cap on the number of data rows parsed; None reads the whole member.
# NOTE(review): the saved output of this cell stops right after the archive
# listing, which suggests the full read did not finish — consider setting this
# to a manageable sample size (e.g. 100_000) while iterating.
N_ROWS = None

# Step 1: Open the archive and display the file names it contains
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    file_list = zip_ref.namelist()
    print(f"Files in the ZIP archive: {file_list}")

    # Step 2: Make sure the requested member exists, so a typo produces an
    # error that lists the valid choices.
    if file_to_read not in file_list:
        raise KeyError(
            f"'{file_to_read}' not found in {zip_file_path}; available members: {file_list}"
        )

    # Step 3: Read the member with pandas while the ZIP file is still open
    with zip_ref.open(file_to_read) as extracted_file:
        data = pd.read_csv(
            extracted_file,
            compression='gzip',
            header=0,
            sep=',',
            quotechar='"',
            engine='python',
            nrows=N_ROWS,
        )

print("First few rows of the dataset:")
print(data.head())
print("\nColumns in the dataset:")
print(data.columns)

# Data Preprocessing (Modify according to your needs)
# Turns the raw `data` frame into two float32 tensors:
#   X - one row per sample, one column per (encoded) feature
#   y - the target column reshaped to a single column
# Raises KeyError when the target column is not present.

# Drop identifier-like columns, but only those that exist in this dataset,
# then replace missing values with 0. A new frame is built instead of mutating
# in place so that re-running the cell always starts from `data`.
columns_to_drop = [col for col in ['id', 'hour'] if col in data.columns]
df = data.drop(columns=columns_to_drop).fillna(0)

# Define the target (y) column name
target_column = 'click'  # Change this if your target column is named differently
missing_target_msg = f"Target column '{target_column}' not found in the dataset columns."

# Fail fast: check for the target before the (potentially expensive) one-hot
# encoding instead of after it.
if target_column not in df.columns:
    raise KeyError(missing_target_msg)

# Encode categorical variables
# NOTE(review): every distinct value of every non-numeric column becomes its
# own dense column here; if this dataset has high-cardinality string columns
# the encoded frame may not fit in memory — verify before a full-size run.
df = pd.get_dummies(df, drop_first=True)

# A non-numeric target would itself have been expanded into dummy columns by
# get_dummies, so re-check before indexing by name.
if target_column not in df.columns:
    raise KeyError(missing_target_msg)

# Convert explicitly to float32. On pandas >= 2.0 get_dummies yields bool
# columns; mixing those with numeric columns makes `.values` an object array,
# which torch.tensor() cannot convert.
feature_values = df.drop(columns=[target_column]).to_numpy(dtype=np.float32)
target_values = df[target_column].to_numpy(dtype=np.float32)

X = torch.tensor(feature_values)              # Features
y = torch.tensor(target_values).view(-1, 1)   # Target (CTR)

# Define Logistic Regression Model
class LogisticRegressionModel(nn.Module):
    """Logistic regression: a single linear unit followed by a sigmoid.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One output unit: the pre-sigmoid score for the positive class.
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for `x`."""
        logits = self.linear(x)
        return logits.sigmoid()

# Train the logistic-regression model on (X, y) with full-batch gradient
# descent, then plot the precision-recall curve of its scores.

# Seed PyTorch's RNG so the random weight initialisation (and therefore the
# printed losses and the curve) is the same on every fresh run.
torch.manual_seed(42)

# Standardise every feature column to zero mean / unit variance. Feeding raw,
# large-magnitude features into sigmoid + BCELoss saturates the sigmoid, which
# makes the gradients vanish and stalls SGD. The clamp keeps constant columns
# (std == 0) from causing a division by zero; they simply become all zeros.
feature_mean = X.mean(dim=0, keepdim=True)
feature_std = X.std(dim=0, unbiased=False, keepdim=True).clamp_min(1e-8)
X_scaled = (X - feature_mean) / feature_std

# Initialize the model
input_dim = X_scaled.shape[1]
model = LogisticRegressionModel(input_dim)

# Define Loss function and Optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss (expects probabilities)
optimizer = optim.SGD(model.parameters(), lr=0.01)  # each step uses the full batch

# Training the model
epochs = 100
model.train()  # loop-invariant, so set once before the loop
for epoch in range(epochs):
    optimizer.zero_grad()
    y_pred = model(X_scaled)
    loss = criterion(y_pred, y)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Generate predictions for evaluation
# NOTE: scores are computed on the same rows the model was trained on, so the
# resulting curve is an optimistic estimate of performance on unseen data.
model.eval()
with torch.no_grad():
    y_scores = model(X_scaled).numpy()

# Calculate Precision-Recall Curve (sklearn expects 1-D label/score arrays)
precision, recall, thresholds = precision_recall_curve(y.numpy().ravel(), y_scores.ravel())
pr_auc = auc(recall, precision)

# Plot Precision-Recall Curve
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title(f'Precision-Recall Curve (AUC = {pr_auc:.4f})')
plt.show()

print("""
Precision-Recall Tradeoff:
- **Precision**: The ratio of true positive predictions to the total number of positive predictions.
- **Recall**: The ratio of true positive predictions to the total number of actual positives.
- A high precision-low recall scenario is useful when minimizing false positives is crucial.
- A high recall-low precision scenario is better when minimizing false negatives is prioritized.
""")


Files in the ZIP archive: ['sampleSubmission.gz', 'test.gz', 'train.gz']
