# Login Pattern Analysis

In [22]:
# file = "../../logs/login.json"
file = "./data/logins_data.json"

In [38]:
import pandas as pd

# Sample data (your example)
data = pd.read_json(file)

# Create DataFrame
df = pd.DataFrame(data)
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Filter out rows older than 10 days
df['timestamp'] = df['timestamp'].dt.tz_convert('UTC')
df = df[df['timestamp'] > pd.Timestamp.now(tz='UTC') - pd.DateOffset(hours=10)]

# Set timestamp as index for time-based operations
df.set_index('timestamp', inplace=True)

# Reset index to avoid issues with non-unique index when applying rolling operation
df_reset = df.reset_index()

# Compute rolling mean for 'status' within each 'user_id' group over 5 minutes
# Ensure the 'timestamp' column is in datetime format
df_reset['timestamp'] = pd.to_datetime(df_reset['timestamp'])

# Sort by 'user_id' and 'timestamp' to ensure proper ordering
df_reset = df_reset.sort_values(by=['user_id', 'timestamp'])

# Now calculate the rolling mean over 5 minutes
df_reset['failed_moving_avg_5min'] = df_reset.groupby('user_id')['status'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=True)

# # Compute rolling mean for 'status' within each 'ip_address' group over 5 minutes
# df_reset['failed_moving_avg_5min_ip'] = df_reset.groupby('ip_address')['status'].rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)

# # Compute rolling mean for 'status' within each 'ip_address' group over 10 days
# df_reset['failed_moving_avg_10days_ip'] = df_reset.groupby('ip_address')['status'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=True)

# # Reset the index back to the original timestamp index
df_reset = df_reset.sort_values(by=['user_id', 'ip_address'])

# Now calculate the rolling mean over 5 minutes
df_reset['failed_moving_avg_5min_ip'] = df_reset.groupby('ip_address')['status'].rolling(10, min_periods=1).mean().reset_index(level=0, drop=True)

df = df_reset.set_index('timestamp')

# Display the result
print(df.head(10), len(df))


                                  user_id  status      ip_address device_type  \
timestamp                                                                       
2024-11-29 12:06:38.414376+00:00        1       1  115.182.47.104      Mobile   
2024-11-29 12:10:18.414179+00:00        1       1  115.182.47.104     Desktop   
2024-11-29 12:10:16.414385+00:00        1       1   117.52.227.24      Tablet   
2024-11-29 12:10:32.414252+00:00        1       1   117.52.227.24      Mobile   
2024-11-29 12:37:24.414295+00:00        1       1   117.52.227.24     Desktop   
2024-11-29 12:56:42.414242+00:00        1       1   117.52.227.24     Desktop   
2024-11-29 11:59:44.414261+00:00        1       1    117.7.56.149     Desktop   
2024-11-29 12:25:05.414190+00:00        1       1    123.26.2.113     Desktop   
2024-11-29 12:09:28.414141+00:00        1       1  13.101.145.186      Tablet   
2024-11-29 12:08:30.414342+00:00        1       1   132.89.219.87      Tablet   

                           

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# One-Hot Encoding for categorical columns (device_type, OS, browser)
df_encoded = pd.get_dummies(df[['device_type', 'OS', 'browser']], drop_first=True)

# add the encoded columns to the original dataframe
df = pd.concat([df, df_encoded], axis=1)

# remove the original categorical columns
df_new = df.drop(['device_type', 'OS', 'browser'], axis=1)

# Combine the encoded features with the existing numerical features
features_user = [
    'failed_moving_avg_5min'
]

X = df_new[features_user]

X_ip = df_new[[
    'failed_moving_avg_5min_ip'
]]

# Initialize Isolation Forest model
iso_forest = IsolationForest(contamination=0.0001)  # You can adjust the contamination parameter

# Fit the model for user_id
iso_forest.fit(X)

# Predict the anomaly scores
df['anomaly_score'] = iso_forest.decision_function(X)

# Fit the model for ip_address
iso_forest.fit(X_ip)

# Predict the anomaly scores
df['anomaly_score_ip'] = iso_forest.decision_function(X_ip)

# get 10 rows with the highest anomaly scores
print(df.nlargest(10, 'anomaly_score')[['user_id', 'ip_address', 'failed_moving_avg_5min']], len(df))

# get 10 rows with the highest anomaly scores for ip_address
print(df.nlargest(10, 'anomaly_score_ip'), len(df))

# get min and max anomaly scores
print(df['anomaly_score'].min(), df['anomaly_score'].max())

# get min and max anomaly scores for ip_address
print(df['anomaly_score_ip'].min(), df['anomaly_score_ip'].max())

                                  user_id       ip_address  \
timestamp                                                    
2024-11-29 12:37:24.414295+00:00        1    117.52.227.24   
2024-11-29 12:56:42.414242+00:00        1    117.52.227.24   
2024-11-29 12:08:30.414342+00:00        1    132.89.219.87   
2024-11-29 12:40:23.414360+00:00        1   218.179.227.56   
2024-11-29 12:34:38.414210+00:00        1    230.33.88.165   
2024-11-29 12:47:47.414315+00:00        1   59.252.229.201   
2024-11-29 12:55:59.414324+00:00        1     7.56.197.162   
2024-11-29 12:04:38.420226+00:00        2  127.149.157.243   
2024-11-29 12:20:05.419982+00:00        2  127.149.157.243   
2024-11-29 12:37:05.420124+00:00        2  127.149.157.243   

                                  failed_moving_avg_5min  
timestamp                                                 
2024-11-29 12:37:24.414295+00:00                     0.6  
2024-11-29 12:56:42.414242+00:00                     0.6  
2024-11-29 12:08:30

In [46]:
# check if a random point is an anomaly
# Sample data for a new point
test_data = {
    "user_id": 100,
    "ip_address": "127.0.0.1",
    "status": 1,
    "device_type": "mobile",
    "OS": "iOS",
    "browser": "Chrome",
    "timestamp": pd.Timestamp.now(tz='UTC')
}

anomaly_score = iso_forest.decision_function([[df['failed_moving_avg_5min'].mean()]])

if anomaly_score >0.4:
    print("Anomaly detected", anomaly_score)
else:
    print("Not an anomaly")

Anomaly detected [0.42079922]


