In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [2]:
train = pd.read_csv('/Users/cenkerarin/ml_test/logistic regression/train_clean.csv')
test = pd.read_csv('/Users/cenkerarin/ml_test/logistic regression/test_clean.csv')

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,has_cabin_letter,ticket_has_letter,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,1,1.0,0.0,1.0
1,2,1,1,38.0,1,0,71.2833,1,1,0.0,0.0,0.0
2,3,1,3,26.0,0,0,7.925,0,1,0.0,0.0,1.0
3,4,1,1,35.0,1,0,53.1,1,0,0.0,0.0,1.0
4,5,0,3,35.0,0,0,8.05,0,0,1.0,0.0,1.0


In [6]:
# Logistic Regression from Scratch - Educational Implementation
# This implementation uses only basic Python to understand the underlying mathematics

class LogisticRegression:
    def __init__(self, learning_rate=0.01, max_iterations=1000):
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.weights = None
        self.bias = None
        
    def _add_bias(self, X):
        """Add bias term to feature matrix"""
        # Convert pandas DataFrame to list of lists if necessary
        if hasattr(X, 'values'):
            X_list = X.values.tolist()
        else:
            X_list = X
        
        bias_column = [1.0] * len(X_list)
        return [[bias_column[i]] + X_list[i] for i in range(len(X_list))]
    
    def _sigmoid(self, z):
        """Sigmoid activation function"""
        # Clip z to prevent overflow
        z = max(-500, min(500, z))
        return 1.0 / (1.0 + (2.718281828 ** (-z)))
    
    def _predict_proba_single(self, x):
        """Predict probability for a single sample"""
        z = sum(w * feature for w, feature in zip(self.weights, x))
        return self._sigmoid(z)
    
    def _cost_function(self, X, y):
        """Calculate logistic regression cost function"""
        total_cost = 0.0
        m = len(X)
        
        # Convert y to list if it's a pandas Series
        if hasattr(y, 'values'):
            y_list = y.values.tolist()
        else:
            y_list = y
        
        for i in range(m):
            h = self._predict_proba_single(X[i])
            # Add small epsilon to prevent log(0)
            epsilon = 1e-15
            h = max(epsilon, min(1 - epsilon, h))
            
            cost = -(y_list[i] * self._log(h) + (1 - y_list[i]) * self._log(1 - h))
            total_cost += cost
            
        return total_cost / m
    
    def _log(self, x):
        """Natural logarithm approximation"""
        # Simple approximation for natural log
        if x <= 0:
            return -1000  # Very negative number for log(0)
        
        # Use the fact that ln(x) = 2 * (y + y^3/3 + y^5/5 + ...) where y = (x-1)/(x+1)
        if x == 1:
            return 0
        
        y = (x - 1) / (x + 1)
        y_squared = y * y
        result = y
        term = y
        
        for i in range(1, 10):  # Use first 10 terms for approximation
            term *= y_squared
            result += term / (2 * i + 1)
            
        return 2 * result
    
    def fit(self, X, y):
        """Train the logistic regression model"""
        # Add bias term to features
        X_with_bias = self._add_bias(X)
        
        # Convert y to list if it's a pandas Series
        if hasattr(y, 'values'):
            y_list = y.values.tolist()
        else:
            y_list = y
        
        # Initialize weights (including bias)
        num_features = len(X_with_bias[0])
        self.weights = [0.0] * num_features
        
        # Gradient descent
        m = len(X_with_bias)
        
        for iteration in range(self.max_iterations):
            # Calculate gradients
            gradients = [0.0] * num_features
            
            for i in range(m):
                h = self._predict_proba_single(X_with_bias[i])
                error = h - y_list[i]
                
                for j in range(num_features):
                    gradients[j] += error * X_with_bias[i][j]
            
            # Update weights
            for j in range(num_features):
                self.weights[j] -= self.learning_rate * (gradients[j] / m)
            
            # Print cost every 100 iterations
            if iteration % 100 == 0:
                cost = self._cost_function(X_with_bias, y_list)
                print(f"Iteration {iteration}, Cost: {cost:.6f}")
    
    def predict_proba(self, X):
        """Predict probabilities for samples"""
        if self.weights is None:
            raise ValueError("Model must be fitted before making predictions")
        
        X_with_bias = self._add_bias(X)
        probabilities = []
        
        for sample in X_with_bias:
            prob = self._predict_proba_single(sample)
            probabilities.append(prob)
        
        return probabilities
    
    def predict(self, X, threshold=0.5):
        """Make binary predictions"""
        probabilities = self.predict_proba(X)
        return [1 if prob >= threshold else 0 for prob in probabilities]

# Example usage with simple data
print("=== Logistic Regression from Scratch ===")

# Create simple dataset for demonstration
# Features: [hours_studied, previous_score]
X_simple = train.drop(columns=['Survived', 'PassengerId'])

# Target: pass (1) or fail (0)
y_simple = train['Survived']

print("Training Data:")
print("Features shape:", X_simple.shape)
print("Labels shape:", y_simple.shape)
print()

# Train the model
model = LogisticRegression(learning_rate=0.01, max_iterations=1000)
print("Training model...")
model.fit(X_simple, y_simple)
print()

# Make predictions
print("Model weights (including bias):", model.weights)
print()

# Test predictions
X_test = test.drop(columns=['PassengerId'])
probabilities = model.predict_proba(X_test)
predictions = model.predict(X_test)

print("Test predictions (first 10 samples):")
for i in range(min(10, len(X_test))):
    print(f"Sample {i+1}: Probability: {probabilities[i]:.4f}, Prediction: {predictions[i]}")


=== Logistic Regression from Scratch ===
Training Data:
Features shape: (891, 10)
Labels shape: (891,)

Training model...
Iteration 0, Cost: 0.630413
Iteration 100, Cost: 0.833880
Iteration 200, Cost: 0.739820
Iteration 300, Cost: 1.398218
Iteration 400, Cost: 0.750975
Iteration 500, Cost: 0.687159
Iteration 600, Cost: 1.364983
Iteration 700, Cost: 0.710325
Iteration 800, Cost: 0.670111
Iteration 900, Cost: 1.263720

Model weights (including bias): [0.071817397736894, -0.22660142155241841, -0.10097097079170185, -0.5264778756862752, -0.09108494739278873, 0.0338064690346868, 0.24370001618649387, -0.06319262111473713, -0.9020012275268127, 0.07311546116849305, -0.1449035587061667]

Test predictions (first 10 samples):
Sample 1: Probability: 0.0094, Prediction: 0
Sample 2: Probability: 0.0031, Prediction: 0
Sample 3: Probability: 0.0008, Prediction: 0
Sample 4: Probability: 0.0165, Prediction: 0
Sample 5: Probability: 0.0401, Prediction: 0
Sample 6: Probability: 0.0597, Prediction: 0
Sample

In [7]:
# Create submission dataframe
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions
})

print("Submission dataframe:")
print(submission.head())
print(f"\nSubmission shape: {submission.shape}")

submission.to_csv('submission_without_lib.csv', index=False)

Submission dataframe:
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0

Submission shape: (418, 2)
