In [3]:
import pandas as pd
df = pd.read_csv('mlops_data.csv')
# Map labels for comparison
gender_map = {"male": 1, "female": 2, "unknown": 0}
C_api_map = {"female": 2, "male": 1, "unknown": 0}
C_man_map = {1: 1, 2: 2, 3: 0}

# Convert labels for consistency
# df['gender'] = df['gender'].replace(gender_map)
df['C_api'] = df['C_api'].replace(C_api_map)
df['C_man'] = df['C_man'].replace(C_man_map)
df = df[df['gender'] != 0]

print(f'Gender-defined records: {len(df)}')
df.head()

Gender-defined records: 3145


Unnamed: 0,gender,C_api,C_man,E_NEds,E_Bpag,firstDay,lastDay,NEds,NDays,NActDays,...,NPcreated,pagesWomen,wikiprojWomen,ns_user,ns_wikipedia,ns_talk,ns_userTalk,ns_content,weightIJ,NIJ
0,1,1,1,2,2,20170527205915,20170721044501,543,56,43,...,4,0,0,91,28,6,76,324,0.915024,978
2,1,1,1,0,2,20060907204302,20140911191722,57,2927,25,...,0,0,0,3,0,1,3,49,0.800528,664
3,1,1,1,1,2,20121003144916,20121208180528,104,67,5,...,2,0,0,20,1,2,2,78,1.027717,841
5,1,1,0,3,1,20060526141942,20170914131932,1887,4130,197,...,62,0,0,7,2,31,10,1660,0.661673,477
7,1,0,1,3,3,20081105224814,20170818012959,5833,3209,566,...,108,0,0,161,10,9,184,4744,1.481654,1596


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features (X) and target variable (y)
X = df.drop('gender', axis=1)  
y = df['gender']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the input features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a DataFrame from the scaled array with column names
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_train.columns)

print(f'Train records: {len(X_train)} \nTest records: {len(X_test)}')

# Print the first few rows of the scaled DataFrame
print(X_test_scaled_df.head())


Train records: 2516 
Test records: 629


In [14]:
from sklearn.linear_model import LogisticRegression

# Initialize the logistic regression model with increased max_iter
logistic_reg_model = LogisticRegression(max_iter=1000) 

# Train the model
logistic_reg_model.fit(X_train_scaled, y_train)


In [21]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix

# Make predictions on the testing set
y_pred = logistic_reg_model.predict(X_test_scaled)

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Accuracy: {accuracy}')

# Compute the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Define the labels for the confusion matrix
labels = ['TN', 'FP', 'FN', 'TP']

# Create a formatted confusion matrix with labels
conf_matrix_formatted = [[f'{label}: {count}' for label, count in zip(labels, row)] for row in conf_matrix]

# Print the formatted confusion matrix
print("Confusion Matrix:")
for row in conf_matrix_formatted:
    print(row)



Precision: 0.9775086505190311
Recall: 0.9964726631393298
F1 Score: 0.9868995633187774
Accuracy: 0.9761526232114467
Confusion Matrix:
['TN: 565', 'FP: 2']
['TN: 13', 'FP: 49']
