##  Data Loading and Exploration


* Import Libraries 



In [3]:
import os
import re
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    auc,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'pandas'

* Load client_hostname.csv

In [None]:
# Directory that holds client_hostname.csv and access.log. Set the
# LOG_ANALYSIS_DIR environment variable to point somewhere else; when it is
# unset the original author's local path is used.
data_dir = os.environ.get(
    'LOG_ANALYSIS_DIR', '/Users/abhishekkandel/Desktop/LOG Analysis/'
)

print(os.getcwd())
# Later cells open the data files by bare file name, so the working directory
# is switched to the data directory here.
os.chdir(data_dir)
print(os.getcwd())
# NOTE(review): this read uses a tab separator, while the "Combine Datasets"
# cell re-reads the same file with ',' -- confirm which delimiter is correct.
client_df = pd.read_csv('client_hostname.csv', sep='\t')
client_df.head()

In [None]:
# Load access.log
# The encoding is pinned so decoding does not depend on the platform default,
# and undecodable bytes are replaced instead of aborting the whole load.
with open('access.log', 'r', encoding='utf-8', errors='replace') as file:
    log_lines = file.readlines()




In [None]:
print(client_df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
client_df.info()

In [None]:
# Summary of client_df as reported by DataFrame.info().
client_df.info()


## Step 2: Data Cleaning and Preprocessing

### Parse and Clean access.log

We will extract structured information from the log entries:
- IP Address
- Timestamp
- Request Method
- Requested URL
- HTTP Version
- Response Code
- User Agent

In [None]:
def parse_log_line(line):
    """Parse one access-log line into a dict of named fields.

    The expected layout is: ip, ident, authuser, a bracketed date, a quoted
    request line (method, request, protocol), a three-digit status, the
    response size, and three quoted trailing fields (referrer, user agent and
    one more field that is stored under 'unknown').

    Args:
        line: A single raw line from the log file.

    Returns:
        A dict with the keys 'ip', 'ident', 'authuser', 'date', 'method',
        'request', 'protocol', 'status' (int), 'bytes' (int), 'referrer',
        'user_agent' and 'unknown', or None when the line does not match
        the expected layout.
    """
    # Regular expression pattern for Apache log format.
    # The size field accepts '-' as well as digits: Apache writes '-' when no
    # body bytes were sent, and such lines were previously rejected outright.
    pattern = (
        r'(\S+) (\S+) (\S+) \[(.*?)\] '
        r'"(\S+) (.*?) (\S+)" (\d{3}) (\d+|-) "(.*?)" "(.*?)" "(.*?)"'
    )
    match = re.match(pattern, line)
    if match is None:
        return None

    raw_bytes = match.group(9)
    return {
        'ip': match.group(1),
        'ident': match.group(2),
        'authuser': match.group(3),
        'date': match.group(4),
        'method': match.group(5),
        'request': match.group(6),
        'protocol': match.group(7),
        'status': int(match.group(8)),
        # '-' means "no bytes sent"; record it as 0 so the column stays numeric.
        'bytes': 0 if raw_bytes == '-' else int(raw_bytes),
        'referrer': match.group(10),
        'user_agent': match.group(11),
        'unknown': match.group(12),
    }

### Parse Log Data

We will parse the log data to extract structured information from each log entry. The extracted information will include:
- IP Address
- Ident
- Authuser
- Date
- Method
- Request
- Protocol
- Status
- Bytes
- Referrer
- User Agent
- Unknown

In [None]:
# Run parse_log_line over every raw line and keep only the truthy results;
# lines it could not match come back as None and are left out.
parsed_logs = [
    record
    for record in map(parse_log_line, log_lines)
    if record
]

# One row per successfully parsed line, one column per dict key.
log_df = pd.DataFrame(parsed_logs)

In [None]:
# Discard every row that has a missing value in any column.
log_df = log_df.dropna()

In [None]:
# Build the 'datetime' column from 'date'. Only the text before the first
# whitespace is parsed, so anything after it in the field is ignored.
date_layout = '%d/%b/%Y:%H:%M:%S'
log_df['datetime'] = log_df['date'].map(
    lambda raw_date: datetime.strptime(raw_date.split()[0], date_layout)
)


* Extract the product ID and the action from each request URL.
 

In [None]:
def extract_product_id(url):
    """Return the digits that follow '/product/' in ``url``.

    The id is returned as a string. None is returned when ``url`` contains no
    '/product/<digits>' segment.
    """
    found = re.search(r'/product/(\d+)', url)
    return found.group(1) if found else None

def extract_action(url):
    """Classify ``url`` as 'view', 'add_to_cart', 'purchase' or 'other'.

    The label is chosen by the first marker substring found in ``url``;
    'other' is returned when none of the markers is present.
    """
    # Checked in this order, so a URL containing several markers gets the
    # label of the earliest entry.
    markers = (
        ('/product/', 'view'),
        ('/add-to-cart/', 'add_to_cart'),
        ('/purchase/', 'purchase'),
    )
    for marker, label in markers:
        if marker in url:
            return label
    return 'other'

In [None]:
# Derive the product id and the action label from each request URL.
request_urls = log_df['request']
log_df['product_id'] = request_urls.map(extract_product_id)
log_df['action'] = request_urls.map(extract_action)


<h2> Combine Datasets
</h2>

In [None]:
print("Columns in log_df:", log_df.columns)
print("Columns in client_df:", client_df.columns)
# Re-read the client table, this time parsed as comma-separated.
client_df = pd.read_csv('client_hostname.csv', delimiter=',')
# Show the column names produced by the comma-separated parse.
print(client_df.columns)
# log_df names this column 'ip'; rename 'client' so both frames share the key.
client_df = client_df.rename(columns={'client': 'ip'})
# Left join: every log row is kept, with the client columns attached where the
# ip has a match.
merged_df = log_df.merge(client_df, how='left', on='ip')

print(merged_df.head())



* Handle Missing/Inconsistent Data

In [None]:
# Rows without a hostname get the placeholder 'unknown'; exact duplicate rows
# are then removed.
merged_df = merged_df.fillna({'hostname': 'unknown'}).drop_duplicates()

print(merged_df.head())

## Feature Engineering

In [None]:
# Assuming a session timeout of 30 minutes:
merged_df = merged_df.sort_values(['ip', 'datetime'])

# Time elapsed since the previous request from the same ip.
gap_since_previous = merged_df.groupby('ip')['datetime'].diff()
# True on every row whose gap exceeds the timeout.
starts_new_session = gap_since_previous > pd.Timedelta(minutes=30)
# The running total is taken over the whole sorted frame, not per ip, so a
# session is identified by the (ip, session_id) pair.
merged_df['session_id'] = starts_new_session.cumsum()

print(merged_df.head())

<h3>
Calculate Session Duration
</h3>

In [None]:
# First and last request time of every (ip, session_id) pair.
session_times = merged_df.groupby(['ip', 'session_id'])['datetime']
session_duration = session_times.agg(['min', 'max'])

# Length of the session in seconds (0 for a single-request session).
elapsed = session_duration['max'] - session_duration['min']
session_duration['session_duration'] = elapsed.dt.total_seconds()

print(session_duration.head())

<h4>Merge back to merged_df:</h4>

In [None]:
session_keys = ['ip', 'session_id']

# Attach the per-session duration to every row of that session.
merged_df = pd.merge(
    merged_df,
    session_duration['session_duration'],
    how='left',
    on=session_keys,
)

print(merged_df.head())

# Number of Views per Product per Session
view_events = merged_df.loc[merged_df['action'] == 'view']
product_views = (
    view_events.groupby(session_keys + ['product_id'])
    .size()
    .reset_index(name='view_count')
)

print(product_views.head())

# Merge back: each row receives the view count of its own
# (ip, session_id, product_id) combination; rows without a match get NaN.
merged_df = pd.merge(
    merged_df,
    product_views,
    how='left',
    on=session_keys + ['product_id'],
)


<h3>Cart Abandonment Rate
</h3>

<p>First, identify sessions where items were added to the cart but not purchased.</p>

In [None]:
def is_abandoned(session_actions):
    """Return True when the actions include 'add_to_cart' but not 'purchase'.

    ``session_actions`` is any container that supports ``in``.
    """
    if 'add_to_cart' not in session_actions:
        return False
    return 'purchase' not in session_actions

# One row per (ip, session_id) with a boolean 'abandoned' flag computed from
# the set of actions seen in that session.
actions_by_session = merged_df.groupby(['ip', 'session_id'])['action']
abandoned_by_session = actions_by_session.apply(
    lambda actions: is_abandoned(set(actions))
)
abandonment = abandoned_by_session.reset_index(name='abandoned')

In [None]:
# Preview, then the True/False counts, the share of abandoned sessions and the
# number of abandoned sessions.
abandoned_flags = abandonment['abandoned']
for summary in (
    abandonment.head(),
    abandoned_flags.value_counts(),
    abandoned_flags.mean(),
    abandoned_flags.sum(),
):
    print(summary)

In [None]:
# Merge back: every row gets the 'abandoned' flag of its session.
merged_df = pd.merge(
    merged_df,
    abandonment,
    how='left',
    on=['ip', 'session_id'],
)

## Categorical Data Encoding


In [None]:
# For simplicity, we'll use Label Encoding here
# Each encoder is kept in its own variable so the mapping can be looked up
# later.
le_action = LabelEncoder()
merged_df['action_encoded'] = le_action.fit_transform(merged_df['action'])

# Product ids are cast to str first, so missing ids are encoded as a label of
# their own.
le_product = LabelEncoder()
merged_df['product_id_encoded'] = le_product.fit_transform(
    merged_df['product_id'].astype(str)
)


In [None]:
print(merged_df.head())

# Session Level Aggregations
# 'view_count' is aggregated with 'count', not 'sum'. The per-product view
# count was merged back onto every matching row, so a product viewed n times in
# a session appears on n rows that each hold n; summing would give n**2 per
# product. Counting the rows that have a view_count gives the number of
# product views in the session.
session_features = merged_df.groupby(['ip', 'session_id']).agg({
    'session_duration': 'first',
    'view_count': 'count',
    'abandoned': 'first',
    # The per-row codes are kept as lists so later cells can inspect which
    # actions and products occurred in the session.
    'action_encoded': list,
    'product_id_encoded': list,
}).reset_index()

## Data Visualization

<h3>User Behavior Trends</h3>


In [None]:
# Count of each action per calendar day: one row per date, one column per
# action, 0 where an action did not occur on that date.
request_dates = merged_df['datetime'].dt.date
daily_action_counts = merged_df.groupby(request_dates)['action'].value_counts()
actions_over_time = daily_action_counts.unstack().fillna(0)

ax = actions_over_time.plot(kind='line', figsize=(12, 6))
ax.set_title('User Actions Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.legend(title='Action')
plt.show()

print(actions_over_time)





In [None]:
# Most Viewed Products
# Product ids of all 'view' rows, reduced to the ten most frequent.
viewed_product_ids = merged_df.loc[merged_df['action'] == 'view', 'product_id']
most_viewed_products = viewed_product_ids.value_counts().head(10)

ax = sns.barplot(x=most_viewed_products.values, y=most_viewed_products.index)
ax.set_title('Top 10 Most Viewed Products')
ax.set_xlabel('View Count')
ax.set_ylabel('Product ID')
plt.show()

print(most_viewed_products)

In [None]:
# Abandonment Trends

# Share of sessions per 'abandoned' value, expressed in percent.
abandonment_rate = abandonment['abandoned'].value_counts(normalize=True) * 100

# Readable slice labels for the boolean index values.
slice_names = {False: 'Not Abandoned', True: 'Abandoned'}
slice_labels = abandonment_rate.index.map(slice_names)

plt.pie(
    abandonment_rate,
    labels=slice_labels,
    autopct='%1.1f%%',
    startangle=140
)
plt.title('Cart Abandonment Rate')
plt.show()

print(abandonment_rate)

<B>Heatmap of Feature Correlations</b>

In [None]:
# Pairwise correlation of the row-level features.
corr_columns = ['session_duration', 'view_count', 'abandoned']
corr = merged_df[corr_columns].corr()

ax = sns.heatmap(corr, annot=True)
ax.set_title('Feature Correlation Heatmap')
plt.show()
print(corr)


## Model  Building 

<b> Define Target Variable</b>

We will predict whether a session will lead to a purchase (purchase action in the session).

Create target variable:

In [None]:
# Target: 1 when the session contains a 'purchase' action, else 0.
#
# The code for 'purchase' is looked up in the encoder as it was fitted when
# 'action_encoded' was created. The encoder is deliberately NOT refitted with
# an extra 'purchase' label: refitting renumbers the classes, so the new code
# for 'purchase' could coincide with the code another action already has in
# the stored 'action_encoded' lists.
if 'purchase' in le_action.classes_:
    # Looked up once instead of once per session.
    purchase_code = le_action.transform(['purchase'])[0]
    session_features['purchase'] = session_features['action_encoded'].apply(
        lambda codes: 1 if purchase_code in codes else 0
    )
else:
    # The action column never contained 'purchase', so no session has one.
    session_features['purchase'] = 0




In [None]:
# Split Data

# NOTE(review): 'abandoned' is computed by is_abandoned from the presence of
# 'purchase' in the session, i.e. from the same information as the target --
# possible target leakage; confirm it should be a feature.
feature_columns = ['session_duration', 'view_count', 'abandoned']
X = session_features[feature_columns]
y = session_features['purchase']

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Preview each part of the split
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.head())

# Graphical Representation
sns.pairplot(session_features, hue='purchase')
plt.show()


In [None]:
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)
# Scores for the ROC curve: column 1 of predict_proba, the same choice the
# combined ROC cell makes. Hard 0/1 predictions would give the curve a single
# operating point and misstate the AUC.
y_score_logreg = logreg.predict_proba(X_test)[:, 1]

# zero_division=0 matches calculate_metrics: the score is reported as 0 when
# it is undefined.
print('Logistic Regression:')
print('Accuracy:', accuracy_score(y_test, y_pred_logreg))
print('Precision:', precision_score(y_test, y_pred_logreg, zero_division=0))
print('Recall:', recall_score(y_test, y_pred_logreg, zero_division=0))
print('F1 Score:', f1_score(y_test, y_pred_logreg, zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred_logreg))


# Graphical Representation
fpr, tpr, thresholds = roc_curve(y_test, y_score_logreg)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()


# SNS Graphical Representation
sns.heatmap(confusion_matrix(y_test, y_pred_logreg), annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.show()








In [None]:
# Random Forest

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
# Scores for the ROC curve: column 1 of predict_proba, the same choice the
# combined ROC cell makes. Hard 0/1 predictions would give the curve a single
# operating point and misstate the AUC.
y_score_rf = rf.predict_proba(X_test)[:, 1]

# zero_division=0 matches calculate_metrics: the score is reported as 0 when
# it is undefined.
print('Random Forest:')
print('Accuracy:', accuracy_score(y_test, y_pred_rf))
print('Precision:', precision_score(y_test, y_pred_rf, zero_division=0))
print('Recall:', recall_score(y_test, y_pred_rf, zero_division=0))
print('F1 Score:', f1_score(y_test, y_pred_rf, zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred_rf))

# Graphical Representation

fpr, tpr, thresholds = roc_curve(y_test, y_score_rf)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()


# Feature Importance
feature_importance = pd.Series(rf.feature_importances_, index=X_train.columns)
feature_importance.nlargest(10).plot(kind='barh')
plt.title('Feature Importance')
plt.show()

print(feature_importance)

# SNS Pairplot
# NOTE(review): same call as the pairplot drawn right after the train/test
# split; it does not depend on the fitted model.

sns.pairplot(session_features, hue='purchase')
plt.show()





In [None]:
# Gradient Boosting
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)
# Scores for the ROC curve: column 1 of predict_proba, the same choice the
# combined ROC cell makes. Hard 0/1 predictions would give the curve a single
# operating point and misstate the AUC.
y_score_gb = gb.predict_proba(X_test)[:, 1]

# zero_division=0 matches calculate_metrics: the score is reported as 0 when
# it is undefined.
print('Gradient Boosting:')
print('Accuracy:', accuracy_score(y_test, y_pred_gb))
print('Precision:', precision_score(y_test, y_pred_gb, zero_division=0))
print('Recall:', recall_score(y_test, y_pred_gb, zero_division=0))
print('F1 Score:', f1_score(y_test, y_pred_gb, zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred_gb))

# Graphical Representation
fpr, tpr, thresholds = roc_curve(y_test, y_score_gb)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)

# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()


# Feature Importance
feature_importance = pd.Series(gb.feature_importances_, index=X_train.columns)
feature_importance.nlargest(10).plot(kind='barh')
plt.title('Feature Importance')
plt.show()

print(feature_importance)


# SNS Pairplot
# NOTE(review): same call as the pairplot drawn right after the train/test
# split; it does not depend on the fitted model.
sns.pairplot(session_features, hue='purchase')
plt.show()



## Model Evaluation



In [None]:
# Performance Metrics

def calculate_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, F1) for the given predictions.

    Precision, recall and F1 are computed with ``zero_division=0``.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, zero_division=0),
        recall_score(y_true, y_pred, zero_division=0),
        f1_score(y_true, y_pred, zero_division=0),
    )

In [None]:
# Score all three models on the same test labels, in the order
# Logistic Regression, Random Forest, Gradient Boosting.
scores_per_model = [
    calculate_metrics(y_test, predictions)
    for predictions in (y_pred_logreg, y_pred_rf, y_pred_gb)
]
(
    (acc_logreg, prec_logreg, rec_logreg, f1_logreg),
    (acc_rf, prec_rf, rec_rf, f1_rf),
    (acc_gb, prec_gb, rec_gb, f1_gb),
) = scores_per_model

In [None]:
# One row per model, one column per metric.
metrics_df = pd.DataFrame(
    [
        ('Logistic Regression', acc_logreg, prec_logreg, rec_logreg, f1_logreg),
        ('Random Forest', acc_rf, prec_rf, rec_rf, f1_rf),
        ('Gradient Boosting', acc_gb, prec_gb, rec_gb, f1_gb),
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

print(metrics_df)

## Visualizing 

In [None]:
# Confusion Matrices
# ConfusionMatrixDisplay is imported with the other sklearn.metrics names in
# the notebook's import cell.

# The fitted estimators and their display names, in matching order; the ROC
# cell below reuses both lists.
models = [logreg, rf, gb]
model_names = ['Logistic Regression', 'Random Forest', 'Gradient Boosting']

# One confusion-matrix figure per model, all on the same test split.
for model, name in zip(models, model_names):
    disp = ConfusionMatrixDisplay.from_estimator(
        model, X_test, y_test, display_labels=['No Purchase', 'Purchase'], cmap=plt.cm.Blues
    )
    disp.ax_.set_title(f'Confusion Matrix: {name}')
    plt.show()

In [None]:
# ROC Curves
# roc_curve and roc_auc_score are imported with the other sklearn.metrics
# names in the notebook's import cell.

plt.figure(figsize=(10, 8))

# One curve per model, drawn on a shared figure so they can be compared.
for model, name in zip(models, model_names):
    # Column 1 of predict_proba is used as the score.
    y_score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_score)
    auc_score = roc_auc_score(y_test, y_score)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.2f})')

plt.plot([0, 1], [0, 1], 'k--')  # Random chance
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

Behaviour Analysis 

User Behavior Patterns

Identify patterns such as:<br>
Frequent Viewers: Users with high view counts but no purchases.<br>
Frequent Buyers: Users with frequent purchases.

In [None]:
# Label each session by its target value: sessions with purchase == 1 become
# 'Frequent Buyer', all others 'Frequent Viewer'.
has_purchase = session_features['purchase'] == 1
session_features['user_type'] = has_purchase.map(
    {True: 'Frequent Buyer', False: 'Frequent Viewer'}
)

sns.pairplot(session_features, hue='user_type')
plt.show()



In [None]:
# High Abandonment Products or Sessions
# Keep only the sessions whose 'abandoned' flag is True.
abandoned_mask = session_features['abandoned'].eq(True)
high_abandonment_sessions = session_features.loc[abandoned_mask]

print(high_abandonment_sessions.head())


In [None]:
# Feature Importance from Random Forest
# Importances reported by the fitted forest, labelled with the feature names.
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)

# Horizontal bars in ascending order of importance.
ax = feature_importances.sort_values().plot(kind='barh')
ax.set_title('Feature Importances from Random Forest')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

print(feature_importances)

