In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Load the dataset
df_filtered = pd.read_json('/content/train.json')
df_filtered = df_filtered.sample(frac=0.1, random_state=42)

# Step 2: Convert 'created' column to datetime format
df_filtered['created'] = pd.to_datetime(df_filtered['created'])

# Step 3: Extract useful components from the 'created' column
df_filtered['year'] = df_filtered['created'].dt.year
df_filtered['month'] = df_filtered['created'].dt.month
df_filtered['day'] = df_filtered['created'].dt.day
df_filtered['hour'] = df_filtered['created'].dt.hour
df_filtered['minute'] = df_filtered['created'].dt.minute
df_filtered['second'] = df_filtered['created'].dt.second

# Drop the original 'created' column as it's no longer needed
df_filtered.drop(columns=['created'], inplace=True)
df_filtered.drop(columns=['manager_id'], inplace=True)

In [5]:
# Step 4: Process list columns ('features' and 'photos')
# For 'features' column: Count the number of features in the list
df_filtered['features_count'] = df_filtered['features'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)

# For 'photos' column: Count the number of photos in the list
df_filtered['photos_count'] = df_filtered['photos'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)

# Drop original 'features' and 'photos' columns
df_filtered.drop(columns=['features', 'photos'], inplace=True)

# Step 5: Handle text columns like 'description' if present
if 'description' in df_filtered.columns:
    # Use CountVectorizer to convert descriptions to numerical values
    vectorizer = CountVectorizer(stop_words='english', max_features=100)  # Limiting to top 100 features for simplicity
    description_matrix = vectorizer.fit_transform(df_filtered['description'])
    description_df = pd.DataFrame(description_matrix.toarray(), columns=vectorizer.get_feature_names_out())
    df_filtered = pd.concat([df_filtered, description_df], axis=1)
    df_filtered.drop(columns=['description'], inplace=True)

In [6]:
# Step 6: One-hot encode categorical columns
categorical_columns = ['building_id', 'display_address', 'street_address']
df_encoded = pd.get_dummies(df_filtered, columns=categorical_columns, drop_first=True)

# Step 7: Label encode the target column (interest_level)
label_encoder = LabelEncoder()
df_encoded['interest_level'] = label_encoder.fit_transform(df_encoded['interest_level'])

# Define the target column (ensure this matches your target column name)
target_column = 'interest_level'

# Step 8: Separate features (X) and target (y)
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]

In [7]:
# Step 9: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [8]:
# Step 10: Train a decision tree classifier
clf = DecisionTreeClassifier(max_depth=2, random_state=5)
clf.fit(X_train, y_train)

# Step 11: Evaluate the model
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Optional: Predict on new unseen data
# Example:
# new_data = pd.DataFrame([...])  # Your new data to predict on
# new_data_encoded = pd.get_dummies(new_data, columns=categorical_columns, drop_first=True)
# new_predictions = clf.predict(new_data_encoded)
# new_predictions = label_encoder.inverse_transform(new_predictions)  # Convert to original labels

Accuracy: 0.8473439917483239
