In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import pandas as pd

def preprocess_and_train_with_cv(file_name, cv_splits):
    # Load the dataset from CSV file into a DataFrame
    df = pd.read_csv(file_name)

    # Handle missing values
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    df['TotalCharges'].fillna(df['TotalCharges'].median(), inplace=True)

    # Convert 'Churn' category from 'Yes' and 'No' to 1 and 0
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

    # Separate numerical and categorical variables
    numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns

    # Standardize numerical features (excluding 'Churn') using z-scores
    scaler = StandardScaler()
    df[numerical_columns.drop('Churn')] = scaler.fit_transform(df[numerical_columns.drop('Churn')])

    # Perform PCA
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(df[numerical_columns.drop('Churn')])

    # Convert PCA data to DataFrame
    df_pca = pd.DataFrame(data=pca_data, columns=['PC1', 'PC2'])

    # Concatenate PCA data with encoded categorical columns
    X = df_pca
    y = df['Churn']

    # Initialize the logistic regression model
    model = LogisticRegression(max_iter=1000)

    # Perform cross-validation
    scores = cross_val_score(model, X, y, cv=cv_splits)

    # Calculate the average accuracy score from cross-validation
    avg_accuracy = scores.mean()

    return avg_accuracy

# Example usage:
file_name = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
cv_splits = 5  # Number of cross-validation splits
average_accuracy = preprocess_and_train_with_cv(file_name, cv_splits)
print("Average Accuracy:", average_accuracy)


Average Accuracy: 0.7458483208594102


In [7]:

# F1 scores for the three versions
versions = ['V1', 'V2', 'V3']
f1_scores = [0.5311, 0.7258, 0.7458]

# Create a bar plot
plt.figure(figsize=(8, 6))
plt.bar(versions, f1_scores, color=['blue', 'green', 'orange'])

# Add labels and title
plt.xlabel('Versions')
plt.ylabel('F1 Score')
plt.title('F1 Scores for Different Versions')

# Show the plot
plt.show()

NameError: name 'plt' is not defined