## ***NLP EXPERIMENT NO - 7***

## ***AIM - Perform Chunking and Analyze the Importance of Selecting Proper Features and Training-Set Size When Training a Model***

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Experiment: rank synthetic features by mutual information with the target,
# keep the top-k, split the rows into fixed-size chunks, and train/evaluate a
# RandomForestClassifier independently on each chunk.

# Seed NumPy's global RNG so the synthetic data below is reproducible.
# mutual_info_classif is called without random_state, so it is expected to
# draw from this same global stream; the order of the RNG calls is therefore
# kept exactly as: rand -> randint -> mutual_info_classif.
np.random.seed(0)

n_rows = 100
n_cols = 10

# Synthetic dataset: uniform random features in [0, 1) and a random 0/1 target.
df = pd.DataFrame(np.random.rand(n_rows, n_cols), columns=[f'Feature_{i}' for i in range(n_cols)])
df['Target'] = np.random.randint(0, 2, n_rows)

# Split features from the label once, by name rather than by column position.
X = df.drop('Target', axis=1)
y = df['Target']

# Score every feature against the target and sort (name, score) pairs from
# most to least informative.
mi_scores = mutual_info_classif(X, y)
feature_importances = sorted(zip(X.columns, mi_scores), key=lambda x: x[1], reverse=True)

# Number of top-ranked features to keep.
k = 5
top_features = [name for name, _ in feature_importances[:k]]

# Rows per chunk. This used to reuse `k`, which tied the chunk size to the
# number of selected features; it is now an independent knob with the same
# value, so the results for the current configuration are unchanged.
# NOTE(review): with 5-row chunks and test_size=0.2 each chunk is evaluated on
# a single row, so the per-chunk accuracy can only be 0 or 1. Consider a much
# larger chunk_size if the accuracies are meant to be informative.
chunk_size = 5

selected = df[top_features + ['Target']]
chunks = [selected.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

chunk_sizes = [len(chunk) for chunk in chunks]
print("Chunk sizes:", chunk_sizes)

for i, chunk in enumerate(chunks):
    # A trailing chunk with a single row cannot be split into non-empty train
    # and test sets (train_test_split raises), so report and skip it.
    if len(chunk) < 2:
        print(f"Chunk {i+1}: skipped (only {len(chunk)} row(s); need at least 2 to split)")
        continue

    X_train, X_test, y_train, y_test = train_test_split(
        chunk.drop('Target', axis=1),
        chunk['Target'],
        test_size=0.2,
        random_state=42,
    )
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Chunk {i+1}: Accuracy = {accuracy:.3f}")

Chunk sizes: [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]
Chunk 1: Accuracy = 0.000
Chunk 2: Accuracy = 1.000
Chunk 3: Accuracy = 1.000
Chunk 4: Accuracy = 1.000
Chunk 5: Accuracy = 0.000
Chunk 6: Accuracy = 0.000
Chunk 7: Accuracy = 0.000
Chunk 8: Accuracy = 0.000
Chunk 9: Accuracy = 0.000
Chunk 10: Accuracy = 1.000
Chunk 11: Accuracy = 1.000
Chunk 12: Accuracy = 0.000
Chunk 13: Accuracy = 0.000
Chunk 14: Accuracy = 0.000
Chunk 15: Accuracy = 0.000
Chunk 16: Accuracy = 1.000
Chunk 17: Accuracy = 1.000
Chunk 18: Accuracy = 0.000
Chunk 19: Accuracy = 1.000
Chunk 20: Accuracy = 1.000
