In [None]:
import pandas as pd
# Each variable reads the file of the same name:
# `train` <- train.csv, `test` <- test.csv.
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [None]:
# prompt: make a random forest prediction model with complete steps, including data collection, EDA, data preprocessing, data visualization, handling missing values, encoding, scaling, model training, etc., as needed

import pandas as pd
# Correcting the variable names as they seem swapped
# Read the two splits in (train, test) order and bind them in one step.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

# --- Data Collection (Already done by reading the CSVs) ---

# --- Exploratory Data Analysis (EDA) ---
# Every two-frame summary below walks the frames in this fixed order.
eda_frames = (("Train", train_df), ("Test", test_df))

# Column names, dtypes and non-null counts.
print("Train Dataset Info:")
train_df.info()
print("\nTest Dataset Info:")
test_df.info()

# Summary statistics as reported by DataFrame.describe().
for split_name, frame in eda_frames:
    print(f"\n{split_name} Dataset Description:")
    print(frame.describe())

# First rows of the training data.
print("\nTrain Dataset Head:")
print(train_df.head())

# Check for missing values: per-column count of nulls.
for split_name, frame in eda_frames:
    print(f"\nMissing values in {split_name} Dataset:")
    print(frame.isna().sum())

# --- Data Visualization (Example: distribution of a numerical column, if applicable) ---
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram (with a KDE overlay) of the non-missing 'Age' values; drawn only
# when the training data actually has an 'Age' column.
if 'Age' in train_df.columns:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(train_df['Age'].dropna(), kde=True, ax=ax)
    ax.set_title('Distribution of Age in Train Dataset')
    ax.set_xlabel('Age')
    ax.set_ylabel('Frequency')
    plt.show()

# Count of rows per 'Embarked' value, split by 'Survived'; drawn only when
# both columns are present in the training data.
if {'Embarked', 'Survived'}.issubset(train_df.columns):
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(x='Embarked', hue='Survived', data=train_df, ax=ax)
    ax.set_title('Survival Count by Embarked Port')
    ax.set_xlabel('Embarked Port')
    ax.set_ylabel('Count')
    plt.show()


# --- Handling Missing Values ---
# Fill values are computed from the training data only and then applied to
# both frames, so train and test are imputed consistently and no statistic is
# derived from the test set.
# Results are assigned back to the column instead of calling
# `fillna(..., inplace=True)` on a column selection: that form is chained
# in-place mutation, which pandas deprecates and which leaves the frame
# unchanged under Copy-on-Write.
fill_values = {}

# Numeric columns: impute with the training mean.
for col in ('Age', 'Fare'):
    if col in train_df.columns:
        fill_values[col] = train_df[col].mean()

# 'Embarked': impute with the most frequent training value. Skipped when the
# column has no non-missing values, because mode() is empty in that case.
if 'Embarked' in train_df.columns:
    embarked_mode = train_df['Embarked'].mode()
    if not embarked_mode.empty:
        fill_values['Embarked'] = embarked_mode[0]

# Apply each fill value to every frame that has the column.
for col, fill_value in fill_values.items():
    for frame in (train_df, test_df):
        if col in frame.columns:
            frame[col] = frame[col].fillna(fill_value)


print("\nMissing values after imputation in Train Dataset:")
print(train_df.isnull().sum())
print("\nMissing values after imputation in Test Dataset:")
print(test_df.isnull().sum())


# --- Data Preprocessing and Feature Engineering (Examples) ---
# 'FamilySize' = SibSp + Parch + 1, added to both frames when the training
# data has the two source columns.
if {'SibSp', 'Parch'}.issubset(train_df.columns):
    for frame in (train_df, test_df):
        frame['FamilySize'] = frame['SibSp'].add(frame['Parch']) + 1

# 'IsAlone' = 1 where FamilySize equals 1, else 0.
if 'FamilySize' in train_df.columns:
    for frame in (train_df, test_df):
        frame['IsAlone'] = frame['FamilySize'].eq(1).astype(int)

# Example: Extracting title from 'Name'
import re
def get_title(name):
    """Return the title found in a name, or "" if there is none.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. ``"Braund, Mr. Owen Harris"``
    gives ``"Mr"``.

    Args:
        name: The name to search. Non-string values (such as NaN or None for
            a missing name) are treated as having no title.

    Returns:
        The matched title without the trailing period, or an empty string
        when nothing matches.
    """
    # Missing names arrive as float NaN / None; re.search would raise on them.
    if not isinstance(name, str):
        return ""
    # Raw string so that `\.` is a regex escape rather than an invalid
    # string escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

if 'Name' in train_df.columns:
    # Titles collapsed into the single 'Rare' bucket.
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    # Spelling variants folded into their common form.
    title_synonyms = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

    # Extract the raw title from every name in both frames.
    for frame in (train_df, test_df):
        frame['Title'] = frame['Name'].apply(get_title)

    # Consolidate rare titles first, then normalise the variants.
    for frame in (train_df, test_df):
        consolidated = frame['Title'].replace(rare_titles, 'Rare')
        frame['Title'] = consolidated.replace(title_synonyms)


# --- Encoding Categorical Variables ---
# One-hot encode 'Sex', 'Embarked' and 'Title'. 'Sex' has to be encoded as
# well: it is a string column that is never dropped, and the random forest
# cannot be fit on string values. Only columns present in both frames are
# encoded, so a missing column does not raise.
categorical_cols = [
    col for col in ('Sex', 'Embarked', 'Title')
    if col in train_df.columns and col in test_df.columns
]

# Fix each column's category list from the training data so both frames
# produce the same dummy columns and `drop_first` removes the same baseline
# category. Test values never seen in training become missing and therefore
# encode as all zeros.
for col in categorical_cols:
    train_categories = sorted(train_df[col].dropna().unique())
    train_df[col] = pd.Categorical(train_df[col], categories=train_categories)
    test_df[col] = pd.Categorical(test_df[col], categories=train_categories)

train_df = pd.get_dummies(train_df, columns=categorical_cols, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_cols, drop_first=True)

# Align columns between train and test sets - important after one-hot encoding.
# A column present in only one frame is added to the other as all zeros.
# Missing columns are collected in DataFrame column order rather than by
# iterating a set difference: set order for strings varies between runs, which
# would make the final feature order (and a seeded model) non-reproducible.
train_cols = set(train_df.columns)
test_cols = set(test_df.columns)

missing_in_test = [c for c in train_df.columns if c not in test_cols]
for c in missing_in_test:
    test_df[c] = 0

missing_in_train = [c for c in test_df.columns if c not in train_cols]
for c in missing_in_train:
    train_df[c] = 0

# Ensure the order of columns is the same
test_df = test_df[train_df.columns]


# --- Feature Selection (Example: Drop irrelevant columns) ---
# Columns removed from both frames before modelling; names that are absent
# are skipped silently (errors='ignore').
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin'] # Cabin often has too many missing values
train_df = train_df.drop(columns=columns_to_drop, errors='ignore')

# The test frame additionally loses the target column, if it carries one.
test_df = test_df.drop(columns=columns_to_drop + ['Survived'], errors='ignore')


# Separate features and target variable.
# Fail immediately when the target is absent: the steps below need X_train and
# y_train, and continuing would either raise NameError or silently reuse
# values left over from an earlier run of this cell.
if 'Survived' not in train_df.columns:
    raise ValueError("Target variable 'Survived' not found in the training data.")

X_train = train_df.drop('Survived', axis=1)
y_train = train_df['Survived']
X_test = test_df # Assuming the test set does not have the target

# --- Scaling Numerical Features ---
from sklearn.preprocessing import StandardScaler

# Candidate columns: everything in X_train stored as int64 or float64.
numerical_cols = list(X_train.select_dtypes(include=['int64', 'float64']).columns)

# Leave out the one-hot encoded columns (by prefix) and the binary 'IsAlone' flag.
numerical_cols_to_scale = [
    name for name in numerical_cols
    if not (name.startswith(('Embarked_', 'Title_')) or name in ['IsAlone'])
]

# Fit the scaler on the training features only, then apply that same
# transformation to the training and the test features.
scaler = StandardScaler().fit(X_train[numerical_cols_to_scale])
for features in (X_train, X_test):
    features[numerical_cols_to_scale] = scaler.transform(features[numerical_cols_to_scale])


# --- Model Training: Random Forest ---
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hold out 20% of the training rows for validation (fixed seed).
(X_train_split, X_val_split,
 y_train_split, y_val_split) = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Random forest with 100 trees and a fixed seed; class_weight='balanced' is
# set in case the classes are imbalanced. Other parameters keep their defaults.
rf_params = dict(n_estimators=100, random_state=42, class_weight='balanced')
model = RandomForestClassifier(**rf_params).fit(X_train_split, y_train_split)

# --- Model Evaluation on Validation Set ---
y_val_pred = model.predict(X_val_split)
val_accuracy = accuracy_score(y_val_split, y_val_pred)
val_report = classification_report(y_val_split, y_val_pred)
val_confusion = confusion_matrix(y_val_split, y_val_pred)

print("\nModel Evaluation on Validation Set:")
print("Accuracy:", val_accuracy)
print("\nClassification Report:\n", val_report)
print("\nConfusion Matrix:\n", val_confusion)

# Optional: Cross-validation on the full training data.
# Left disabled; uncomment the lines below to run 5-fold cross-validation
# with `cross_val_score` and print the per-fold and mean accuracy.
# print("\nCross-validation scores (Accuracy):")
# cv_scores = cross_val_score(model, X_train, y_train, cv=5) # 5-fold cross-validation
# print(cv_scores)
# print("Mean CV Accuracy:", cv_scores.mean())


# --- Prediction on the Test Set ---
# Predict a label for every row of X_test with the fitted model.
predictions = model.predict(X_test)

# Optional: build a submission file from the predictions (left disabled).
# 'PassengerId' was dropped from the feature frames, so the IDs are re-read
# from the original test CSV.
# test_passenger_ids = pd.read_csv('/content/test.csv')['PassengerId'] # Need original test IDs
# submission_df = pd.DataFrame({'PassengerId': test_passenger_ids, 'Survived': predictions})
# submission_df.to_csv('submission.csv', index=False)
# print("\nPredictions made on the test set. Example first 10 predictions:")
# print(predictions[:10])
