# Login Pattern Analysis

In [5]:
# file = "../../logs/login.json"
file = "./data/logins_data.json"

In [14]:
import pandas as pd

# Sample data (your example)
data = pd.read_json(file)

# Create DataFrame
df = pd.DataFrame(data)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Filter out rows older than 10 days
# Convert to UTC (if it's already aware and in a different timezone)
df['timestamp'] = df['timestamp'].dt.tz_convert('UTC')

# Make the current timestamp timezone-aware in UTC
df = df[df['timestamp'] > pd.Timestamp.now(tz='UTC') - pd.DateOffset(days=10)]

# Set timestamp as index for time-based operations
df.set_index('timestamp', inplace=True)

# Create rolling mean for failed logins based on time intervals
# First, let's assume 5 minutes corresponds to a window of 5 rows, and 10 days corresponds to 10 rows (this can be adjusted based on the actual time differences)
df['failed_moving_avg_5min'] = df.groupby('user_id')['status'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=False)
df['failed_moving_avg_10days'] = df.groupby('user_id')['status'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=False)

# Resetting index to handle rolling operations based on 'ip_address' and prevent duplicate label issues
df['failed_moving_avg_5min_ip'] = df.groupby('ip_address')['status'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=False)['status']
df['failed_moving_avg_10days_ip'] = df.groupby('ip_address')['status'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=False)['status']

# Display the result
print(df.head(10))


ValueError: Cannot set a DataFrame with multiple columns to the single column failed_moving_avg_5min

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Select features for the Isolation Forest model
features = ['failed_moving_avg_5min', 'failed_moving_avg_10days', 'failed_moving_avg_5min_ip', 'failed_moving_avg_10days_ip']
X = df[features]

# Initialize Isolation Forest model
iso_forest = IsolationForest(contamination=0.1)  # You can adjust the contamination parameter

# Fit the model
df['anomaly'] = iso_forest.fit_predict(X)

# The anomaly column will have values of 1 for normal data points and -1 for anomalies
# Convert -1 to 'Anomaly' and 1 to 'Normal'
df['anomaly'] = df['anomaly'].map({1: 'Normal', -1: 'Anomaly'})

# Also give anomaly score between 0 and 1
df['anomaly_score'] = iso_forest.decision_function(X)

# Display the anomalies with score greater than 0.5
print(df[df['anomaly_score'] > 0.5])

                    user_id   ip_address  failed_moving_avg_5min_ip  \
timestamp                                                             
2024-11-22 00:00:00   user1  192.168.1.1                   1.000000   
2024-11-22 00:01:00   user1  192.168.1.1                   1.000000   
2024-11-22 00:02:00   user2  192.168.1.2                   1.000000   
2024-11-22 00:03:00   user1  192.168.1.1                   0.666667   
2024-11-22 00:04:00   user3  192.168.1.3                   1.000000   
2024-11-22 00:05:00   user2  192.168.1.2                   0.500000   
2024-11-22 00:06:00   user1  192.168.1.1                   0.750000   
2024-11-22 00:07:00   user3  192.168.1.3                   0.500000   
2024-11-22 00:08:00   user4  192.168.1.4                   0.000000   
2024-11-22 00:09:00   user1  192.168.1.1                   0.800000   
2024-11-22 00:10:00   user2  192.168.1.2                   0.666667   
2024-11-22 00:11:00   user1  192.168.1.1                   0.800000   
2024-1

In [19]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Sample data (including new features)
data = {
    'timestamp': pd.date_range(start='2024-11-22', periods=20, freq='T'),
    'user_id': ['user1', 'user1', 'user2', 'user1', 'user3', 'user2', 'user1', 'user3', 'user4', 'user1', 'user2', 'user1', 'user3', 'user4', 'user2', 'user1', 'user3', 'user4', 'user2', 'user3'],
    'ip_address': ['192.168.1.1', '192.168.1.1', '192.168.1.2', '192.168.1.1', '192.168.1.3', '192.168.1.2', '192.168.1.1', '192.168.1.3', '192.168.1.4', '192.168.1.1', '192.168.1.2', '192.168.1.1', '192.168.1.3', '192.168.1.4', '192.168.1.2', '192.168.1.1', '192.168.1.3', '192.168.1.4', '192.168.1.2', '192.168.1.3'],
    'failed': [1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1],
    'device_type': ['Desktop', 'Mobile', 'Desktop', 'Mobile', 'Desktop', 'Mobile', 'Desktop', 'Desktop', 'Mobile', 'Desktop', 'Desktop', 'Mobile', 'Desktop', 'Mobile', 'Desktop', 'Mobile', 'Desktop', 'Mobile', 'Desktop', 'Mobile'],
    'OS': ['Linux', 'Windows', 'Linux', 'Windows', 'Linux', 'Windows', 'Linux', 'Linux', 'Windows', 'Linux', 'Windows', 'Linux', 'Windows', 'Linux', 'Windows', 'Linux', 'Windows', 'Linux', 'Windows', 'Linux'],
    'browser': ['Chrome', 'Firefox', 'Chrome', 'Firefox', 'Chrome', 'Firefox', 'Chrome', 'Chrome', 'Firefox', 'Chrome', 'Firefox', 'Chrome', 'Firefox', 'Chrome', 'Firefox', 'Chrome', 'Firefox', 'Chrome', 'Firefox', 'Chrome']
}

# Create DataFrame
df = pd.DataFrame(data)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Filter out rows older than 10 days
df = df[df['timestamp'] > pd.Timestamp.now() - pd.DateOffset(days=10)]

# Set timestamp as index for time-based operations
df.set_index('timestamp', inplace=True)

# Create rolling mean for failed logins based on time intervals
df['failed_moving_avg_5min'] = df.groupby('user_id')['failed'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)
df['failed_moving_avg_10days'] = df.groupby('user_id')['failed'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=True)

df['failed_moving_avg_5min_ip'] = df.groupby('ip_address')['failed'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)
df['failed_moving_avg_10days_ip'] = df.groupby('ip_address')['failed'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=True)

# One-Hot Encoding for categorical columns (device_type, OS, browser)
df_encoded = pd.get_dummies(df[['device_type', 'OS', 'browser']], drop_first=True)

# Combine the encoded features with the existing numerical features
features = [
    'failed_moving_avg_5min', 'failed_moving_avg_10days', 'failed_moving_avg_5min_ip', 'failed_moving_avg_10days_ip'
]

# Include categorical features directly in the features list
X = pd.concat([df[features], df_encoded], axis=1)

# Initialize Isolation Forest model
iso_forest = IsolationForest(contamination=0.1)  # You can adjust the contamination parameter

# Fit the model
df['anomaly'] = iso_forest.fit_predict(X)

# The anomaly column will have values of 1 for normal data points and -1 for anomalies
# Convert -1 to 'Anomaly' and 1 to 'Normal'
df['anomaly'] = df['anomaly'].map({1: 'Normal', -1: 'Anomaly'})

# Display the results
print(df[['user_id', 'ip_address', 'failed', 'failed_moving_avg_5min', 'failed_moving_avg_10days', 'anomaly', 'device_type', 'OS', 'browser']])


                    user_id   ip_address  failed  failed_moving_avg_5min  \
timestamp                                                                  
2024-11-22 00:00:00   user1  192.168.1.1       1                1.000000   
2024-11-22 00:01:00   user1  192.168.1.1       1                1.000000   
2024-11-22 00:02:00   user2  192.168.1.2       1                1.000000   
2024-11-22 00:03:00   user1  192.168.1.1       0                0.666667   
2024-11-22 00:04:00   user3  192.168.1.3       1                1.000000   
2024-11-22 00:05:00   user2  192.168.1.2       0                0.500000   
2024-11-22 00:06:00   user1  192.168.1.1       1                0.750000   
2024-11-22 00:07:00   user3  192.168.1.3       0                0.500000   
2024-11-22 00:08:00   user4  192.168.1.4       0                0.000000   
2024-11-22 00:09:00   user1  192.168.1.1       1                0.800000   
2024-11-22 00:10:00   user2  192.168.1.2       1                0.666667   
2024-11-22 0

  'timestamp': pd.date_range(start='2024-11-22', periods=20, freq='T'),
