In [1]:
%%time

# This script generates the Important Features for each of the 12 optimized clusters in Sample 1 and Sample 2

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
  
# WordNet lemmatizer instance used by the preprocessing helper
lemmatizer = WordNetLemmatizer()

# NLTK English stopwords followed by corpus-specific terms.
# Note: this rebinds the imported `stopwords` name to a plain list.
stopwords = [
    *stopwords.words('english'),
    'climate', 'change', 'amp', 'deleted', 'climate change', 'don',
    'amp nbsp', 'don know', 'people', 'cunt', 'cunt cunt',
]

# Read the cleaned data stored in the "PreProcessedData.csv" file
df = pd.read_csv("PreProcessedData.csv")

Wall time: 51.3 s


In [2]:
%%time

# Create a function to preprocess the NLTK-tokenized text (one list of tokens per document)
def word_tokenizer(text):
    """Lowercase, filter and lemmatize an iterable of tokens.

    A token is dropped when its lowercased form is in the module-level
    ``stopwords`` collection or when the token has two characters or fewer.
    Every surviving token is lowercased and passed through the module-level
    ``lemmatizer``.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        Lemmatized lowercase tokens, in their original order.
    """
    lemmas = []
    for raw_token in text:
        lowered = raw_token.lower()
        if lowered in stopwords:   # Remove stopwords
            continue
        if len(raw_token) <= 2:    # Remove short tokens
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return lemmas


# Create the function to generate Important Features for each sample
def getImportantFeatures(sampNo=1, nClusters=12):
    """Rank TF-IDF terms by how strongly they separate each cluster from the rest.

    The final cluster labels are read from ``FinalLabelsSample{sampNo}.csv`` and
    inner-joined onto the module-level ``df`` (``df.id == sample.ids``). The
    ``clean_text`` column is tokenized with ``word_tokenize``, filtered with
    ``word_tokenizer`` and vectorized with a TF-IDF vocabulary capped at 500
    terms. For every cluster ``k`` in ``1..nClusters`` a one-vs-rest
    ``RandomForestClassifier`` is fitted on that matrix; the 10 terms with the
    largest ``feature_importances_`` are printed and the top 50 are written to
    sheet ``Cluster{k}`` of ``ImportantFeaturesSample{sampNo}.xlsx``.

    Parameters
    ----------
    sampNo : int, default 1
        Sample number; selects the label file and names the output workbook.
    nClusters : int, default 12
        Number of clusters. Labels are expected to be the integers 1..nClusters.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the merged data contains a label outside 1..nClusters (including NaN).
    """
    # Load the final labels for this sample
    sample = pd.read_csv(f"FinalLabelsSample{sampNo}.csv")

    # Check the length of dataframe. It should be 150,000 in this case
    print(f"\nSample {sampNo} size: {len(sample)}\n")
    print("Cluster sizes:\n",sample['labels'].value_counts())

    # Merge the labels with the preprocessed data and tokenize the clean text
    temp = df.merge(sample, how='inner', left_on='id', right_on='ids')
    temp['date'] = pd.to_datetime(temp['date'])
    temp.set_index('date', inplace=True)   # `temp` is local, so in-place indexing is safe here
    temp['tokens'] = temp['clean_text'].apply(word_tokenize).apply(word_tokenizer)

    # Fail loudly on unexpected labels instead of silently counting them as "rest"
    unexpected = set(temp['labels'].unique()) - set(range(1, nClusters + 1))
    if unexpected:
        raise ValueError(f"Unexpected cluster labels for sample {sampNo}: {sorted(unexpected, key=str)}")

    # The documents are already token lists, so the vectorizer's tokenizer is the identity
    cv = TfidfVectorizer(lowercase=False, tokenizer=lambda x:x, max_features=500)
    cv_matrix = cv.fit_transform(temp['tokens'])

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only on older installs
    if hasattr(cv, "get_feature_names_out"):
        feature_names = np.asarray(cv.get_feature_names_out())
    else:
        feature_names = np.array(cv.get_feature_names())

    # The context manager saves and closes the workbook even if a fit fails part-way
    with pd.ExcelWriter(f"ImportantFeaturesSample{sampNo}.xlsx", engine='xlsxwriter') as writer:
        for clusterNo in range(1, nClusters + 1):
            print("\n=========================================")

            # One-vs-rest target: 1 for rows in this cluster, 0 for every other cluster
            target = (temp['labels'] == clusterNo).astype('int64').rename(f"Binary Cluster {clusterNo}")
            print()
            print(f"Training binary classifier for Cluster {clusterNo}\n",target.value_counts())

            # Train the supervised learning model (fixed seed for reproducible importances)
            clf = RandomForestClassifier(random_state=47)
            clf.fit(cv_matrix, target.values)

            # Sort the features by importance weight, largest first
            order = np.argsort(clf.feature_importances_)[::-1]
            most_important_features = feature_names[order]
            most_important_weights = clf.feature_importances_[order]
            ranked = list(zip(most_important_features, most_important_weights))
            print(f"\nSample {sampNo}, Cluster {clusterNo} - Important Features with weights\n",ranked[:10])

            # Write the top 50 features of this cluster to its own sheet
            featuredf = pd.DataFrame(ranked[:50], columns=['feature','weight'])
            featuredf.to_excel(writer, sheet_name=f'Cluster{clusterNo}', index=True)
    

# Generate the important-features workbook for Sample 1, then for Sample 2
for sample_number in (1, 2):
    getImportantFeatures(sampNo=sample_number)
        


Sample 1 size: 150000

Cluster sizes:
 1     27980
9     21779
11    20154
12    13809
4     13648
6     10303
8      8970
3      6980
7      6820
2      6661
5      6543
10     6353
Name: labels, dtype: int64


Training binary classifier for Cluster 1
 0    122020
1     27980
Name: Binary Cluster 1, dtype: int64

Sample 1, Cluster 1 - Important Features with weights
 [('energy', 0.11637725244070947), ('solar', 0.10032690704042617), ('oil', 0.09531111682657499), ('nuclear', 0.06062058676366141), ('power', 0.038917217599615805), ('renewable', 0.029919166746907342), ('electric', 0.022854083086580913), ('wind', 0.02170593486322998), ('gas', 0.017266946449892698), ('fuel', 0.016178770051855938)]


Training binary classifier for Cluster 2
 0    143339
1      6661
Name: Binary Cluster 2, dtype: int64

Sample 1, Cluster 2 - Important Features with weights
 [('carbon', 0.14273413448219205), ('coal', 0.12032953860703305), ('emission', 0.10763324742861592), ('pollution', 0.06648795249847991), (

Training binary classifier for Cluster 4
 0    135813
1     14187
Name: Binary Cluster 4, dtype: int64

Sample 2, Cluster 4 - Important Features with weights
 [('warming', 0.06811621697950289), ('global', 0.04029636646972011), ('science', 0.02741685737735144), ('scientist', 0.02663788605969599), ('new', 0.00949005071856415), ('action', 0.009221311456619944), ('world', 0.008087400828072447), ('crisis', 0.007399237765577591), ('report', 0.00690277529222727), ('say', 0.006611436096686527)]


Training binary classifier for Cluster 5
 0    143406
1      6594
Name: Binary Cluster 5, dtype: int64

Sample 2, Cluster 5 - Important Features with weights
 [('ice', 0.13666477934078192), ('arctic', 0.07285115090234769), ('temperature', 0.06833276982348496), ('warming', 0.029703835214426846), ('record', 0.02766560388992866), ('sea', 0.024231833691792125), ('global', 0.019879525890534492), ('year', 0.013527468410024819), ('level', 0.012984423591186654), ('winter', 0.011883213051761517)]


Training bi