In [None]:
import pandas as pd

In [None]:
# Load the customer-churn CSV into `df` (absolute path under /content/sample_data).
df = pd.read_csv("/content/sample_data/Customer_Churn.csv")

In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info();

In [None]:
# Number of missing values in each column.
missing_per_column = df.isnull().sum()
print(missing_per_column)


In [None]:
# Check how TotalCharges was parsed (it is coerced to numeric further down).
total_charges = df['TotalCharges']
print(total_charges.dtype)

In [None]:
def assign_tenure_group(tenure):
    """Bucket a tenure value into 'Low', 'Medium' or 'High'.

    Args:
        tenure: Numeric tenure value.

    Returns:
        'Low' for tenure <= 12, 'Medium' for 12 < tenure <= 36,
        'High' otherwise.

    The ranges are contiguous, so a fractional value such as 12.5 is
    'Medium' instead of falling through to 'High'. A NaN fails both
    comparisons and therefore still ends up in 'High'.
    """
    if tenure <= 12:
        return 'Low'
    if tenure <= 36:
        return 'Medium'
    return 'High'

# Label every row with its tenure bucket, then show how many rows land in each.
tenure_labels = df['tenure'].apply(assign_tenure_group)
df['TenureGroup'] = tenure_labels
print(df['TenureGroup'].value_counts())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Grouped bar chart: number of customers per contract type, split by churn.
sns.set(style='whitegrid')
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Contract', hue='Churn', palette='Set2', ax=ax)
ax.set_title('Churn by Contract Type')
ax.set_xlabel('Contract Type')
ax.set_ylabel('Customer Count')
ax.legend(title='Churn')
plt.xticks(rotation=15)
fig.tight_layout()
plt.show()


In [None]:
# Names of all object-dtype columns.
object_frame = df.select_dtypes(include=['object'])
categorical_cols = list(object_frame.columns)
print(categorical_cols)

In [None]:
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is the Churn column.
y = df['Churn']
X = df.drop(columns=['Churn'])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target from the features and hold out 20% of the rows,
# stratified on the target, with a fixed seed.
y = df['Churn']
X = df.drop(columns=['Churn'])

split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Drop the identifier column. `errors='ignore'` keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
df = df.drop(columns=['customerID'], errors='ignore')


In [None]:
from sklearn.model_selection import train_test_split

# Rebuild features/target from the current frame and redo the stratified 80/20 split.
y = df['Churn']
X = df.drop(columns=['Churn'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Remove the identifier column when it is still present; a no-op otherwise.
df = df.drop(columns=['customerID'], errors='ignore')

In [None]:
# Coerce TotalCharges to numeric (unparseable entries become NaN), then fill
# the gaps with the column median.
# The result is assigned back instead of calling `fillna(..., inplace=True)`
# on `df['TotalCharges']`: that form is a chained in-place update, which
# pandas 2.x warns about and which does not modify `df` under Copy-on-Write.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(total_charges.median())

In [None]:
# Bucket tenure into Low [0, 12], Medium (12, 36] and High (36, inf).
# The last edge is open-ended rather than `df['tenure'].max()`: with the data
# maximum as an edge, pd.cut raises whenever that maximum is <= 36 because the
# bin edges are then no longer strictly increasing.
df['TenureGroup'] = pd.cut(df['tenure'],
                           bins=[0, 12, 36, float('inf')],
                           labels=['Low', 'Medium', 'High'],
                           include_lowest=True)

In [None]:
# Columns that are label-encoded to integer codes in place.
binary_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
    'Churn',
]

# One encoder instance is re-fitted for each column in turn, so after the
# loop it only holds the classes of the last column.
le = LabelEncoder()
for col in binary_cols:
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values

In [None]:
# Columns that are one-hot encoded.
multi_cat_cols = [
    'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'Contract', 'PaymentMethod',
    'TenureGroup',
]

# drop_first=True omits the first level of each column from the dummies.
df = pd.get_dummies(data=df, columns=multi_cat_cols, drop_first=True)

In [None]:
# Min-max scale the two charge columns in place.
# NOTE(review): the scaler is fitted on every row of `df`, i.e. before the
# train/test split that follows.
scaler = MinMaxScaler()
charge_columns = ['MonthlyCharges', 'TotalCharges']
scaled_charges = scaler.fit_transform(df[charge_columns])
df[charge_columns] = scaled_charges


In [None]:
X = df.drop('Churn', axis=1)
y = df['Churn']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# The charge columns were min-max scaled on the full frame before this split,
# so the scaling statistics included the test rows. Re-fit a scaler on the
# training rows only and apply it to both splits. Min-max scaling is affine,
# so re-fitting on the already scaled values gives the same result as fitting
# on the raw training values.
charge_cols = ['MonthlyCharges', 'TotalCharges']
train_scaler = MinMaxScaler().fit(X_train[charge_cols])
X_train = X_train.copy()  # explicit copies: avoid writing into slices of `X`
X_test = X_test.copy()
X_train[charge_cols] = train_scaler.transform(X_train[charge_cols])
X_test[charge_cols] = train_scaler.transform(X_test[charge_cols])


In [None]:
# Fit the three baseline classifiers on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
log_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_model

In [None]:
def evaluate_model(name, model):
    """Print accuracy, precision, recall and F1 for a fitted model.

    Predictions are made on the module-level `X_test` and scored against the
    module-level `y_test`, using each metric function's default settings.

    Args:
        name: Label printed above the scores.
        model: Object exposing `predict`.
    """
    predictions = model.predict(X_test)
    print(f" {name}")
    scorers = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1 Score :", f1_score),
    )
    for caption, scorer in scorers:
        print(caption, scorer(y_test, predictions))
    print("-" * 40)

# Report the test-set scores of each baseline model.
baseline_models = (
    ("Logistic Regression", log_model),
    ("Decision Tree", dt_model),
    ("Random Forest", rf_model),
)
for model_label, fitted_model in baseline_models:
    evaluate_model(model_label, fitted_model)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(name, model, X_test, y_test):
    """Print accuracy, precision, recall and F1, each rounded to 4 decimals.

    Args:
        name: Label printed in the heading line.
        model: Object exposing `predict`.
        X_test: Features passed to `model.predict`.
        y_test: True labels the predictions are scored against.
    """
    predictions = model.predict(X_test)
    print(f" {name} Evaluation")
    scorers = (
        ("Accuracy :", accuracy_score),
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1 Score :", f1_score),
    )
    for caption, scorer in scorers:
        print(caption, round(scorer(y_test, predictions), 4))
    print("-" * 40)

In [None]:
# Score each model on the held-out split, passing the test data explicitly.
models_to_report = {
    "Logistic Regression": log_model,
    "Decision Tree": dt_model,
    "Random Forest": rf_model,
}
for model_label, fitted_model in models_to_report.items():
    evaluate_model(model_label, fitted_model, X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hyper-parameter grids, one per estimator family.

# Logistic regression: regularisation strength and solver.
param_grid_log = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['liblinear', 'lbfgs'],
)

# Decision tree: split criterion, depth limit and minimum split size.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
)

# Random forest: number of trees, depth limit and minimum split size.
param_grid_rf = dict(
    n_estimators=[50, 100],
    max_depth=[None, 10],
    min_samples_split=[2, 5],
)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# 5-fold grid searches scored on accuracy, one per estimator family.
#
# The base estimators are passed inline instead of being bound to
# `log_model` / `dt_model` / `rf_model`: GridSearchCV fits clones, so
# rebinding those names would replace the fitted baseline models with
# unfitted ones and break any later re-run of the evaluation cells.
# The tree-based estimators get a fixed `random_state` so the selected
# parameters and scores are reproducible between runs.
grid_log = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_log, cv=5, scoring='accuracy')
grid_log.fit(X_train, y_train)

grid_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5, scoring='accuracy')
grid_dt.fit(X_train, y_train)

grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, scoring='accuracy')
grid_rf.fit(X_train, y_train)


In [None]:
# Best parameter set and mean cross-validated accuracy of each search.
finished_searches = (
    ("Logistic Regression", grid_log),
    ("Decision Tree", grid_dt),
    ("Random Forest", grid_rf),
)
for model_label, search in finished_searches:
    print(f"Best parameters for {model_label}:", search.best_params_)
    print(f"Best accuracy for {model_label}:", search.best_score_)


In [None]:
def _importance_table(search):
    """Feature importances of a search's best estimator, largest first."""
    table = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': search.best_estimator_.feature_importances_
    })
    return table.sort_values(by='Importance', ascending=False)


dt_importances = _importance_table(grid_dt)
rf_importances = _importance_table(grid_rf)

# Show the ten highest-ranked features of each tuned model.
print("Important Features from Decision Tree:\n", dt_importances.head(10))
print("Important Features from Random Forest:\n", rf_importances.head(10))


In [None]:
# Load the customer-support tweets CSV. This rebinds `df`, which held the
# churn frame up to this point.
df = pd.read_csv("/content/sample_data/Customer_Support_Tweets.csv")

In [None]:
# Summarise the tweets frame: column names, non-null counts and dtypes.
df.info();

In [None]:
# List the column names of the tweets frame.
print(df.columns)

In [None]:
# Independent copies of the rows flagged inbound and of the rows flagged not inbound.
inbound_flag = df['inbound']
inbound_df = df[inbound_flag.eq(True)].copy()
outbound_df = df[inbound_flag.eq(False)].copy()


In [None]:
# Attach to every inbound tweet the timestamp of its reply.
#
# An inbound tweet can have several outbound replies; merging all of them
# would emit one row per reply and duplicate the tweet in the result. Keep
# only the earliest reply per tweet, so `merged_df` has at most one row per
# inbound tweet and the timestamp is that of the first response.
# Timestamps are parsed here because sorting the raw strings would not be
# chronological; rows without a reply target are dropped so a missing key
# can never be matched.
first_responses = (
    outbound_df[['in_response_to_tweet_id', 'created_at']]
    .dropna(subset=['in_response_to_tweet_id'])
    .assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))
    .sort_values('created_at')
    .drop_duplicates(subset='in_response_to_tweet_id', keep='first')
)

# Columns present on both sides (`created_at`, `in_response_to_tweet_id`)
# get the `_inbound` / `_response` suffixes.
merged_df = pd.merge(
    inbound_df,
    first_responses,
    left_on='tweet_id',
    right_on='in_response_to_tweet_id',
    how='left',
    suffixes=('_inbound', '_response')
)


In [None]:
# Parse both timestamp columns, then express the reply delay in minutes.
for timestamp_col in ('created_at_inbound', 'created_at_response'):
    merged_df[timestamp_col] = pd.to_datetime(merged_df[timestamp_col])

reply_delay = merged_df['created_at_response'] - merged_df['created_at_inbound']
merged_df['response_time_minutes'] = reply_delay.dt.total_seconds() / 60


In [None]:
# 'urgent' when a reply arrived in under 60 minutes; everything else,
# including tweets with no recorded reply time, is 'non-urgent'.
replied_within_hour = merged_df['response_time_minutes'].lt(60)
merged_df['urgency'] = replied_within_hour.map({True: 'urgent', False: 'non-urgent'})


In [None]:
import re
import string

def preprocess_tweet(text):
    """Normalise a tweet for tokenisation.

    Steps, in order: strip @mentions, strip URLs (`http…` / `www…`), remove
    ASCII punctuation, lowercase, collapse whitespace runs and trim.

    Args:
        text: Tweet text. `None` and float NaN (a missing cell) are treated
            as empty text; any other non-string value is converted with
            `str()` first.

    Returns:
        The cleaned string, or '' for missing input.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = re.sub(r'@\w+', '', text)
    # `http\S+` already covers https links, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
# Clean every tweet's `text` with preprocess_tweet and store the result in `clean_text`.
merged_df['clean_text'] = merged_df['text'].apply(preprocess_tweet)

In [None]:
pip install transformers datasets scikit-learn torch



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Encode the urgency string as an integer class id.
urgency_to_id = {'non-urgent': 0, 'urgent': 1}
merged_df['label'] = merged_df['urgency'].map(urgency_to_id)

# Hold out 20% of the tweets for validation, with a fixed seed.
all_texts = merged_df['clean_text'].tolist()
all_labels = merged_df['label'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, random_state=42, test_size=0.2
)


In [None]:
from transformers import DistilBertTokenizerFast
# Tokenise both splits with the same settings: truncation and padding enabled.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
tokenize_options = dict(truncation=True, padding=True)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)


In [None]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded inputs with their labels.

    Args:
        encodings: Mapping from field name to an indexable sequence with one
            entry per example (presumably the tokenizer output).
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, followed by the label under 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a TweetDataset.
train_dataset, val_dataset = (
    TweetDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
    )
)


In [None]:
from transformers import DistilBertForSequenceClassification

# Load the pretrained checkpoint with a two-class classification head.
checkpoint_name = 'distilbert-base-uncased'
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)


In [None]:
from transformers import Trainer, TrainingArguments

import inspect

# Newer transformers releases call this argument `eval_strategy`, while older
# ones only know `evaluation_strategy`. Passing the wrong one raises a
# TypeError, so use whichever name the installed TrainingArguments accepts.
_eval_strategy_arg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    **{_eval_strategy_arg: 'epoch'},  # evaluate on eval_dataset after every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# NOTE(review): a later cell builds another Trainer around the same `model`
# and calls train() again, so a full run trains the model twice.
trainer.train()


In [None]:
import transformers
# Print the installed transformers version.
print(transformers.__version__)


In [None]:
# Training configuration without an evaluation schedule; progress is logged
# every 10 steps.
training_options = dict(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=10,
)
training_args = TrainingArguments(**training_options)


In [None]:
# Trainer over the train/validation datasets with the current training_args.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer = Trainer(**trainer_options)

In [None]:
# Fine-tune `model` with the current Trainer.
trainer.train()

In [None]:
# Evaluate on the Trainer's eval_dataset; the returned value is only echoed
# by the cell, it is not stored in a variable.
trainer.evaluate()


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Turn an evaluation prediction pair into accuracy and weighted F1.

    Args:
        eval_pred: Two-item iterable `(logits, labels)`; `logits` must expose
            `argmax(axis=-1)`.

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted = scores.argmax(axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average='weighted')
    return metrics


In [None]:
# Rebuild the Trainer so evaluation also reports accuracy and F1 via compute_metrics.
metric_trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**metric_trainer_options)


In [None]:
# `results` is not assigned in any of the cells visible before this one, so
# printing it raised NameError. Evaluate with the current Trainer and keep
# the metrics dict.
results = trainer.evaluate()
print(results)

In [None]:
# A bare dict literal: it is evaluated and echoed as the cell's value, but
# nothing is assigned.
# NOTE(review): looks like pasted example output rather than code — confirm.
{'eval_loss': 0.32, 'eval_runtime': 5.2}

In [None]:
!pip install streamlit


In [None]:
pip install streamlit transformers torch seaborn matplotlib


In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, pipeline


In [None]:

import numpy as np
import streamlit as st  # was missing: every `st.` call raised NameError
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast


@st.cache_resource
def load_model():
    """Load the tokenizer and the fine-tuned classifier.

    Decorated with `st.cache_resource`, so Streamlit caches the returned pair.

    Returns:
        Tuple `(tokenizer, model)`.
    """
    # DistilBertTokenizerFast is the tokenizer class used for training here;
    # the previously referenced `DistilBertTokenizer` was never imported.
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    model = DistilBertForSequenceClassification.from_pretrained("your_model_dir")  # Replace with your fine-tuned model directory
    return tokenizer, model


tokenizer, model = load_model()
model.eval()
st.title("Tweet Urgency Classification")
st.markdown("Classifies tweets as **urgent** or **non-urgent** based on response time prediction.")

tweet = st.text_area("Enter Tweet Text:", "")

# This checkbox must live outside the button branch. `st.button` is True only
# for the script run triggered by the click; toggling a widget created inside
# that branch triggers a rerun in which the button is False again, so a nested
# checkbox could never display its content.
show_attention = st.checkbox("Show attention weights")

if st.button("Predict Urgency"):
    if tweet.strip() == "":
        st.warning("Please enter a tweet.")
    else:
        # NOTE(review): training used the cleaned text (`clean_text`), while
        # the raw input is tokenized here — confirm whether the same
        # preprocessing should be applied before inference.
        inputs = tokenizer(tweet, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            # A single forward pass; attentions are only requested when needed.
            outputs = model(**inputs, output_attentions=show_attention)
            probs = torch.softmax(outputs.logits, dim=1).numpy()[0]

        # Class index 1 is 'urgent', matching the label mapping used for training.
        label = "Urgent" if np.argmax(probs) == 1 else "Non-Urgent"
        confidence = round(float(np.max(probs)) * 100, 2)

        st.subheader("Prediction:")
        st.success(f"**{label}** (Confidence: {confidence}%)")
        if show_attention:
            st.write("Attention layers output:")
            for i, layer_attn in enumerate(outputs.attentions):
                st.write(f"Layer {i + 1} attention shape: {layer_attn.shape}")
if st.checkbox("Show model summary"):
    st.write(model)
