### Feature Extraction using TF-IDF vectorization
- Uses scikit-learn's `TfidfVectorizer` to turn the `cleaned_text` column into a sparse TF-IDF feature matrix.

In [None]:
pip show scikit-learn

In [6]:
import pandas as pd
# importing required modules
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# defining the path to the dataset
dataset_path = r'D:/JantaKoAwaj-FYP/jka-ml-model/dataset/preprocessed_data.pkl'

# reading the dataset saved in pickle format
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by this project's own preprocessing step.
df = pd.read_pickle(dataset_path)

# Fail fast with a clear message if the columns this notebook relies on
# (text for the vectorizer, class label for the target) are not present.
required_columns = {'cleaned_text', 'Label'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"preprocessed dataset is missing column(s): {sorted(missing_columns)}"
    )

print(df.head())


                  Brief Description of the grievance  \
0  Mero ghar ko aagan ma dhulo dherai aauchha. Dh...   
1  Mero galli ma hidda mero chhaya dherai lamo bh...   
2  वडा नं. १६ नवोदित स्कुल नजिकै द्याबु मार्गमा व...   
3  नदीमा माछाहरू धेरै छन्। यिनीहरूले पानी फोहोर ग...   
4   ३१ वडा पञ्चकुमारी मन्दिर निरकाे खालि जग्गामा ...   

                                        cleaned_text  \
0  mero ghar ko aagan ma dhulo dherai aauchha. dh...   
1  mero galli ma hidda mero chhaya dherai lamo bh...   
2  वडा नं. १६ नवोदित स्कुल नजिकै द्याबु मार्गमा व...   
3  नदीमा माछाहरू धेरै छन्। यिनीहरूले पानी फोहोर ग...   
4  ३१ वडा पञ्चकुमारी मन्दिर निरकाे खालि जग्गामा ल...   

                             stopword_removed_tokens        Label  
0  [ghar, aagan, dhulo, aauchha, ., dhulo, rokna,...  not_genuine  
1  [galli, hidda, chhaya, lamo, bhayo, ., chhaya,...  not_genuine  
2  [वडा, नं, ., १६, नवोदित, स्कुल, द्याबु, मार्गम...      genuine  
3  [नदीमा, माछाहरू, छन्।, यिनीहरूले, पानी, फोहोर,...  

In [11]:
# initializing the TfidfVectorizer
#
# Why a custom token_pattern: the vectorizer's default pattern is built on the
# regex class \w, and in Python \w does not match Devanagari combining signs
# (dependent vowel signs, virama, anusvara, ...). With the default, Nepali
# words in `cleaned_text` are cut at each of those signs and the leftover
# fragments become the vocabulary. The class below extends \w with the
# Devanagari code points up to U+0963 (letters and combining signs, stopping
# before the danda U+0964 so sentence punctuation still separates tokens) and
# with ZWNJ/ZWJ, so a whole word stays a single token. Tokens still need at
# least two characters, and Latin-script tokens are matched as before.
devanagari_aware_token = r'(?u)[\w\u0900-\u0963\u200c\u200d]{2,}'

vectorizer = TfidfVectorizer(
    max_features=10000,      # keep only the 10,000 most frequent terms
    ngram_range=(1, 2),      # unigrams and bigrams
    token_pattern=devanagari_aware_token,
)

# fitting the vectorizer and transforming the corpus in one pass;
# the result has one row per document and one column per retained term
X_tfidf = vectorizer.fit_transform(df['cleaned_text'])

In [12]:
# Summarize the fitted TF-IDF matrix: a header line, then one line for its
# dimensions and one for its container type.
print("\nTfidfVectorizer Output")
for _caption, _value in (
    ("Shape of TfidfVectorizer matrix (documents x features):", X_tfidf.shape),
    ("Type of matrix:", type(X_tfidf)),  # a SciPy sparse matrix
):
    print(_caption, _value)


TfidfVectorizer Output
Shape of TfidfVectorizer matrix (documents x features): (9874, 10000)
Type of matrix: <class 'scipy.sparse._csr.csr_matrix'>


In [13]:
# Import the submodule explicitly: `scipy.sparse` is only guaranteed to be
# reachable as an attribute once it has been imported somewhere.
import scipy.sparse
import joblib

# Mapping the labels to binary value (0 and 1)
# 0 for not_genuine and 1 for genuine
label_to_int = {'not_genuine': 0, 'genuine': 1}
df['labeled'] = df['Label'].map(label_to_int)

# Series.map turns every value that is not a key of the mapping into NaN.
# Stop here, before any file is written, rather than saving blank labels.
unmapped_labels = df.loc[df['labeled'].isna(), 'Label']
if not unmapped_labels.empty:
    raise ValueError(
        f"{len(unmapped_labels)} row(s) have a Label outside {sorted(label_to_int)}; "
        f"examples: {unmapped_labels.unique()[:10].tolist()!r}"
    )

# The feature matrix and the label column are saved to separate files and are
# only usable together if they are row-aligned; this catches a `df` that was
# changed after the vectorizer was fitted.
if X_tfidf.shape[0] != len(df):
    raise ValueError(
        f"TF-IDF matrix has {X_tfidf.shape[0]} rows but the DataFrame has {len(df)}; "
        "re-run the vectorizer cell before saving."
    )

# Save TF-IDF matrix (as .npz since it's sparse)
# TF-IDF matrices are huge but mostly zeros; the .npz sparse format stores
# only the non-zero values, saving memory and disk space.
scipy.sparse.save_npz(r'D:/JantaKoAwaj-FYP/jka-ml-model/dataset/features/tfidf_features.npz', X_tfidf)

# Save the fitted vectorizer so the same vocabulary and IDF weights can be
# applied to new text later
joblib.dump(vectorizer, r'D:/JantaKoAwaj-FYP/jka-ml-model/dataset/features/tfidf_vectorizer.pkl')

# Save the binary labels (single column named 'labeled', no index column)
df['labeled'].to_csv(r'D:/JantaKoAwaj-FYP/jka-ml-model/dataset/features/labeled_data.csv', index=False)

print("Saved the TF-IDF features and vectorizer successfully.")


Saved the TF-IDF features and vectorizer successfully.
