In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw dataset and take a first look at it.
df = pd.read_csv('anoma_data_set.csv')
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()

In [None]:
# Understanding the dataset

# Size of the dataset (rows x columns).
n_rows, n_cols = df.shape
print(f"Dataset contains {n_rows} rows and {n_cols} columns.")

# Each report is printed under its heading, in this order:
# missing-value counts, summary statistics, column dtypes.
overview_reports = (
    ("Missing Values in each column:", df.isnull().sum()),
    ("Summary Statistics:", df.describe()),
    ("Data Types:", df.dtypes),
)
for heading, report in overview_reports:
    print(heading)
    print(report)

In [None]:
# Visualize data

# Distribution of the target column
sns.countplot(x='y', data=df)
plt.title("Distribution of Target Variable")
plt.show()

# Correlation between features.
# Only numeric columns are correlated: the frame still holds text columns at
# this point (they are converted later in the notebook), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

# Distribution of a single feature ('feature_column_name' is a placeholder
# for a real column of the dataset).
sns.histplot(df['feature_column_name'], kde=True)
plt.title("Distribution of Feature")
plt.show()

# Checking for duplicates: boolean mask, True for rows repeating an earlier row.
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

In [None]:
# Data Cleaning
from sklearn.preprocessing import MinMaxScaler

# Missing values per column.
print(df.isnull().sum())

# Remove a column that is not needed ('column_to_drop' is a placeholder).
# It must be a different column from the one cleaned below: dropping
# 'column_name' here would make every later df['column_name'] lookup raise
# KeyError.
df = df.drop(columns=['column_to_drop'])

# Inspect the spread of the column before capping its extremes.
sns.boxplot(data=df, x='column_name')
plt.title("Boxplot of Column")
plt.show()

# Cap values outside the chosen quantile band.
# NOTE(review): the band is asymmetric (5th to 85th percentile) — confirm
# that the upper bound is intentional.
upper_limit = df['column_name'].quantile(0.85)
lower_limit = df['column_name'].quantile(0.05)
df['column_name'] = np.clip(df['column_name'], lower_limit, upper_limit)

# Give the date and categorical columns their proper dtypes.
df['date_column'] = pd.to_datetime(df['date_column'])
df['categorical_column'] = df['categorical_column'].astype('category')

# Remove exact duplicate rows.
df = df.drop_duplicates()

# Normalization: rescale the two features with MinMaxScaler (default range 0-1).
scaler = MinMaxScaler()
df[['feature1', 'feature2']] = scaler.fit_transform(df[['feature1', 'feature2']])

In [None]:
# Feature Engineering
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold

# Interaction term between the two scaled features.
df['new_feature'] = df['feature1'] * df['feature2']

# Calendar parts extracted from the parsed date column.
df['year'] = df['date_column'].dt.year
df['month'] = df['date_column'].dt.month
df['day_of_week'] = df['date_column'].dt.dayofweek

# log(1 + x) transform of 'feature'.
df['feature'] = np.log1p(df['feature'])

# One-hot encode the categorical column; drop_first omits one level.
df = pd.get_dummies(df, columns=['categorical_column'], drop_first=True)

# Pairwise correlations over numeric/boolean columns only: the datetime
# column is still in the frame and cannot be correlated.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
# Keep only strongly correlated pairs. The diagonal (each column with itself,
# always 1.0) is masked out, and the magnitude is used so that strong
# negative correlations are reported as well.
off_diagonal = ~np.eye(len(corr_matrix), dtype=bool)
high_corr = corr_matrix.where((corr_matrix.abs() > 0.9) & off_diagonal)
print(high_corr)

# Feature matrix and target. They are built here because this cell runs
# before the train/test cell; without them model.fit below raises NameError
# on a fresh kernel. Non-numeric columns (the raw datetime) are excluded
# because the estimators cannot fit on them.
X = df.drop(columns=['y']).select_dtypes(include=['number', 'bool'])
y = df['y']

model = RandomForestClassifier(random_state=42)
model.fit(X, y)

# Display feature importances
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
print(feature_importances.sort_values(by='Importance', ascending=False))

# Drop features whose variance is below the threshold.
selector = VarianceThreshold(threshold=0.01)
X_new = selector.fit_transform(X)

In [None]:
# Training, testing and selecting the model
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Feature matrix and target. Only numeric/boolean columns are kept: the raw
# datetime column is still in df and the estimators cannot fit on it.
X = df.drop(columns=['y']).select_dtypes(include=['number', 'bool'])
y = df['y']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit every candidate and report its test accuracy so the models can be
# compared, instead of overwriting one fitted model with the next.
candidate_models = {
    'logistic_regression': LogisticRegression(random_state=42),
    'random_forest': RandomForestClassifier(random_state=42),
    'svc': SVC(random_state=42),
    'xgboost': xgb.XGBClassifier(random_state=42),
}
for candidate_name, candidate in candidate_models.items():
    candidate.fit(X_train, y_train)
    candidate_accuracy = accuracy_score(y_test, candidate.predict(X_test))
    print(f"{candidate_name} test accuracy: {candidate_accuracy:.2f}")

# The detailed evaluation below is run on the XGBoost classifier, which is
# also the estimator left bound to `model` for the following cells.
model = candidate_models['xgboost']

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC analysis uses the probability in column 1.
# NOTE(review): assumes a binary target whose positive class is the second
# class — confirm against the data.
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {auc_score:.2f}')
# Diagonal reference line: a classifier with no discriminative power.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

In [None]:
# Hyperparameter tuning, ensembling and model improvement
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Search space for a random forest.
param_grid = {
    'n_estimators': [60, 120, 240],
    'max_depth': [4, 5, 6, 8],
    'min_samples_split': [3, 6, 9],
    'min_samples_leaf': [1, 2, 4]
}

# The grid holds random-forest parameters, so the search is given a random
# forest explicitly. `model` is the XGBoost classifier at this point and has
# no min_samples_split / min_samples_leaf parameters.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
print(f"Tuned Random Forest Test Accuracy: {accuracy_score(y_test, best_model.predict(X_test)):.2f}")

# Majority-vote ensemble. The 'rf' slot uses the tuned forest, and every
# member is seeded so the result is reproducible.
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', best_model),
        ('svc', SVC(random_state=42)),
        ('xgb', xgb.XGBClassifier(random_state=42)),
    ],
    voting='hard',
)
ensemble_model.fit(X_train, y_train)
print(f"Ensemble Test Accuracy: {accuracy_score(y_test, ensemble_model.predict(X_test)):.2f}")

# L2-regularised logistic regression; comparing training and test accuracy
# shows how far the two diverge.
# NOTE(review): this estimator, not best_model or ensemble_model, is what
# stays bound to `model` after this cell — confirm that is the one to ship.
model = LogisticRegression(C=0.1, random_state=42, solver='liblinear', penalty='l2')
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_accuracy:.2f}")

y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_accuracy:.2f}")

In [None]:
# Model Packaging
import joblib

# Persist the fitted estimator to disk.
joblib.dump(value=model, filename='model.pkl')

# deployment_script.py
import joblib
import pandas as pd

# Restore the persisted estimator. joblib files are pickle-based, so only
# load files that come from a trusted source.
model = joblib.load(filename='model.pkl')

def predict_anomalies(data, estimator=None):
    """Predict labels for new observations with a fitted model.

    Parameters
    ----------
    data : mapping or array-like
        Anything accepted by ``pd.DataFrame(...)``, for example a dict that
        maps each column name to a list of values.
    estimator : object, optional
        Fitted model exposing ``predict``. When omitted, the module-level
        ``model`` is used.

    Returns
    -------
    Whatever ``predict`` returns for the assembled frame.

    Raises
    ------
    ValueError
        If ``data`` contains no rows or no columns.
    """
    frame = pd.DataFrame(data)  # Assuming data is passed as a dictionary
    if frame.empty:
        raise ValueError("predict_anomalies() received no data to score.")
    # Apply any necessary preprocessing here (scaling, encoding, etc.) so the
    # frame matches what the model was trained on.
    active_model = model if estimator is None else estimator
    return active_model.predict(frame)

# The module name and the sentinel both take double underscores; the
# single-underscore spelling refers to an undefined name and raises NameError.
if __name__ == "__main__":
    # Example inputs. feature1/feature2 are min-max scaled during training,
    # so values between 0 and 1 are used here.
    # NOTE(review): the saved model was fitted on every column of X, not just
    # these two — extend the dict to the full training schema before relying
    # on the output.
    value1, value2 = 0.5, 0.5
    sample_data = {'feature1': [value1], 'feature2': [value2]}  # Sample input data
    prediction = predict_anomalies(sample_data)
    print(f"Prediction: {prediction}")