In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Load the raw training table, then discard every column that holds no values at all.
data_raw = pd.read_csv("orange_small_churn_train_data.csv")
data_raw1 = data_raw.dropna(axis="columns", how="all")

# Continue with a reproducible 30% random subset of the rows (fixed seed).
train_data = data_raw1.sample(random_state=42, frac=0.3)

In [3]:
# Target vector: the "labels" column of the sampled training rows.
y_train = train_data["labels"]

# Feature matrix: every remaining column except the "ID" and "labels" columns.
X_train = train_data.drop(columns=["ID", "labels"])

In [4]:
# Split the feature columns by dtype instead of by a hard-coded position (174).
# A positional cut silently mislabels columns as soon as the set of columns that
# survives the all-NaN drop changes (different data file, different sample, ...).
# NOTE(review): assumes pandas parsed the numeric features as numeric dtypes and
# the categorical ones as object — confirm this reproduces the former 174-column split.
numerical_cols = X_train.select_dtypes(include="number").columns
# Everything that is not numeric is treated as categorical, so the two groups
# always partition the full set of feature columns.
categorical_cols = X_train.select_dtypes(exclude="number").columns


In [5]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and later releases removed the old name, so a literal `sparse=False` raises a
# TypeError there. Pick whichever keyword the installed version understands.
_dense_kwarg = "sparse_output" if "sparse_output" in OneHotEncoder().get_params() else "sparse"

# Column-wise preprocessing:
#   * "num": fill missing numeric values with the per-column median;
#   * "cat_encode": one-hot encode categorical columns as a dense array, encoding
#     categories not seen during fit as all zeros (handle_unknown="ignore").
# Columns not listed in either group are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numerical_cols),
        (
            "cat_encode",
            OneHotEncoder(handle_unknown="ignore", **{_dense_kwarg: False}),
            categorical_cols,
        ),
    ]
)

In [6]:
# Fit the preprocessing steps on the training features and transform them in one pass;
# the fitted `preprocessor` is reused later to transform the test features.
X_train_preprocessed = preprocessor.fit_transform(X_train)



In [7]:
# Random forest with default hyperparameters and a fixed seed, trained on the
# preprocessed training features. `fit` returns the estimator itself.
clf = RandomForestClassifier(random_state=42).fit(X_train_preprocessed, y_train)
clf

In [8]:
# Load the test table and discard every column that holds no values at all.
test1 = pd.read_csv("orange_small_churn_test_data.csv")
test_data = test1.dropna(axis="columns", how="all")

In [9]:
# Build the test feature matrix with exactly the training feature columns.
# The all-NaN column drop was applied to the test table independently of the
# training table, so a column that is empty in the test file but not in the
# training file would be missing here and `preprocessor.transform` would fail.
# Reindexing restores such columns (filled with NaN) and discards any column the
# training features do not have.
X_test = test_data.drop(columns=["ID"]).reindex(columns=X_train.columns)
# A column re-created by reindex is float NaN; give the columns that are
# object-typed in training the same dtype so the encoder sees consistent input.
X_test = X_test.astype(
    {col: object for col in X_train.columns[X_train.dtypes == object]}
)

In [10]:
# Apply the already-fitted preprocessing to the test features (transform only, no refit).
X_test_preprocessed = preprocessor.transform(X_test)

In [12]:
# Hard class predictions and the predicted probability of the second class
# (column 1 of predict_proba, i.e. clf.classes_[1]) for every test row.
y_pred = clf.predict(X_test_preprocessed)
y_prob = clf.predict_proba(X_test_preprocessed)[:, 1]

# Submission table: the test "ID" column (renamed to "id") next to its probability.
output = test_data[["ID"]].rename(columns={"ID": "id"}).assign(result=y_prob)
output.to_csv("submissions.csv", index=False)