# In [None]:
# Task 1: Detect unusual trends in sales data using anomaly detection

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

# Simulated sales data (daily sales over 100 days)
# Seed NumPy's global RNG so the synthetic series is identical on every run.
np.random.seed(42)
sales = np.random.normal(loc=200, scale=20, size=100)
sales[95:] += 100  # add unusual spike to the last 5 days

df = pd.DataFrame({'day': range(1, 101), 'sales': sales})

# Detect anomalies
# contamination=0.05 tells the forest to treat about 5% of the points
# (5 of 100) as outliers, which matches the number of injected spike days.
# random_state is pinned so the flagged set does not depend on how much of
# the global RNG stream was consumed before this line (e.g. when notebook
# cells are re-run out of order).
model = IsolationForest(contamination=0.05, random_state=42)
# fit_predict yields +1 for inliers and -1 for outliers; recode to a 0/1 flag.
df['anomaly'] = model.fit_predict(df[['sales']])
df['anomaly'] = df['anomaly'].map({1: 0, -1: 1})

# Plot
# Select the flagged rows once and reuse them for both scatter coordinates.
anomalies = df[df['anomaly'] == 1]

plt.figure(figsize=(10, 4))
plt.plot(df['day'], df['sales'], label='Sales')
plt.scatter(anomalies['day'], anomalies['sales'], color='red', label='Anomaly')
plt.title("Sales Trend with Anomaly Detection")
plt.xlabel("Day")
plt.ylabel("Sales")
plt.legend()
plt.grid(True)
plt.show()
# Task 2: Use clustering to detect similar (duplicate-like) entries

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

# Example names; each row of the literal holds two spellings that look like
# the same person
records = [
    "John Doe", "Jon Doe",
    "Jane Smith", "J. Smith",
    "Jake Peralta", "Jake Peraltah",
    "Amy Santiago", "Ami Santiago",
]

# Represent every name as a TF-IDF vector over character 2- to 4-grams
# ('char_wb' analyzer), so small spelling differences still share features
vectorizer = TfidfVectorizer(ngram_range=(2, 4), analyzer='char_wb')
X = vectorizer.fit_transform(records)

# Group the vectors with DBSCAN using cosine distance; a cluster needs at
# least two members within eps of each other
clustering = DBSCAN(metric='cosine', min_samples=2, eps=0.5)
clustering.fit(X)
labels = clustering.labels_

# Show each record next to its cluster id, ordered by cluster
df_dup = pd.DataFrame({'record': records, 'cluster': labels})
grouped = df_dup.sort_values('cluster')
print(grouped)
# Task 3: Use classification to validate if entries are clean or problematic

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Sample dataset with label (1 = issue, 0 = clean)
data = {
    'age': [25, 30, np.nan, 40, 1000],       # 1000 is outlier
    'income': [50000, 60000, 55000, None, 70000],
    'gender': ['M', 'F', 'F', 'M', 'Unknown'],
    'issue': [0, 0, 1, 1, 1]  # Label issues manually
}

df = pd.DataFrame(data)

# Preprocess
# Encode gender as integers. NOTE: a value outside this mapping would become
# NaN here and is not imputed below.
df['gender'] = df['gender'].map({'M': 0, 'F': 1, 'Unknown': 2})

# Record which cells were missing BEFORE imputing. The labels mark rows with
# missing values as problematic, so plain imputation would erase the very
# signal the classifier is supposed to learn.
numeric_cols = ['age', 'income']
for col in numeric_cols:
    df[f'{col}_missing'] = df[col].isna().astype(int)

# Impute with the median rather than the mean: the mean of 'age' is dragged
# to 273.75 by the 1000 outlier, whereas the median (35) stays plausible.
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

X = df[['age', 'income', 'gender', 'age_missing', 'income_missing']]
y = df['issue']

# Train and evaluate classifier
# stratify=y keeps both classes present in the train and test folds, which a
# plain random split cannot guarantee on a 5-row dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)
# Pin the forest's seed so repeated runs give the same predictions.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# zero_division=0 reports 0 (instead of warning) for a class that is never
# predicted, which is likely with a test fold this small.
print(classification_report(y_test, y_pred, zero_division=0))