In [None]:
# importing library
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# reading dataset
df = pd.read_csv(r'C:\Users\akank\Dropbox\My PC (LAPTOP-NQ9H8NTJ)\Documents\Sem 8\Project\datasets\brown\brown.csv')

df

# 1. Data Preparation

In [None]:
plt.figure(figsize=(10, 3))
df.groupby([df['label']]).size().sort_values(ascending=True).plot(kind='barh')
plt.title('Brown')
plt.xlabel('Number of texts')
plt.ylabel('Types of texts')
plt.show()

In [None]:
# dropping labels other than defined Fiction and Non-Fiction labels

index = df[(df.label == 'religion') | (df.label == 'lore') | (df.label == 'editorial') | 
(df.label == 'humor') | (df.label == 'belles_lettres')].index

df.drop(index, inplace = True)

df

In [None]:
# changing labels to fiction_genre and non_fiction_genre

df["label"] = np.where(df["label"] == ('fiction' or 'mystery' or 'romance' or 'adventure' or 'science_fiction'), 
                                        "fiction_genre", "non_fiction_genre")
                                        
df

In [None]:
# dropping paragraphs with sentences less than 5 or 6 (to deal with data imbalance)

sent_count = df.groupby(['filename', 'para_id'], as_index =  False).size()
print(sent_count)

for i in range(len(sent_count)):
    size = sent_count['size'].iloc[i]
    if((size<5) or (size>6)):
        doc = sent_count['filename'].iloc[i]
        para = sent_count['para_id'].iloc[i]
        index = df[(df['filename'] == doc) & (df['para_id'] == para)].index
        df.drop(index, inplace = True)
        
df

In [16]:
# storing labels of each paragraph
df_label = {k: f.groupby('para_id')['label'].apply(list).to_dict()
     for k, f in df.groupby('filename')}

class_label = []
for file_id, filename in df_label.items():
    for para_id, label in filename.items():
            class_label.append(label[0])
            
print(class_label)

pd.DataFrame(class_label, columns= ['label']).to_csv("labels.csv")

['non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction_genre', 'non_fiction

# 2. Data Pre-Processing

In [None]:
# removing punctuation
import string

def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct

df['tokenized_text_wo_punct'] =  df['tokenized_text'].apply(lambda x: remove_punctuation(x))

df

In [None]:
# POS Tagging, Dependency and Constituency Parsing is done before Feature Extraction in next step

# 3. Feature Extraction

In [None]:
from feature_calculation.feature import Features

# calling Features class from feature_extraction.py
features = Features()

# creating sentence list for each paragraph
df_dict = {k: f.groupby('para_id')['tokenized_text_wo_punct'].apply(list).to_dict()
     for k, f in df.groupby('filename')}

# getting all features for each paragraph
feature_list = []
for file_id, filename in df_dict.items():
    for para_id, sent_list in filename.items():
            feature_list.append(features.get_all_features(sent_list))

# saving the features in a new dataframe
features = pd.DataFrame(feature_list)
features.to_csv("extracted_features.csv")

# 4. Supervised Learning

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

# calling the dataframe which will be used for training
X = pd.read_csv(r'C:\Users\akank\Dropbox\My PC (LAPTOP-NQ9H8NTJ)\Documents\Sem 8\Project\code\extracted_features.csv')
y = pd.read_csv(r'C:\Users\akank\Dropbox\My PC (LAPTOP-NQ9H8NTJ)\Documents\Sem 8\Project\code\labels.csv')
X = X.fillna(0)

# scaling data
scaler = StandardScaler()
scaled_data = scaler.fit(X)
scaled_data = scaler.transform(X)

scaled_data

array([[-1.73090413e+00, -5.15920127e-01,  8.45708430e-01, ...,
         2.44245602e+00,  2.65705024e+00,  1.21032518e+00],
       [-1.72861003e+00, -1.11890661e+00, -4.90638374e-01, ...,
        -1.17373879e-03,  8.66951340e-01,  8.91626602e-01],
       [-1.72631592e+00, -3.72351917e-01,  1.22138862e+00, ...,
        -3.29106737e-01, -3.47758628e-01, -2.64016387e-01],
       ...,
       [ 1.72631592e+00,  9.96331683e-01,  2.72347042e+00, ...,
        -1.64082389e-01, -2.51860472e-01,  5.14412878e-01],
       [ 1.72861003e+00, -1.15719146e+00, -1.22499505e+00, ...,
        -5.78758955e-01, -5.65127780e-01, -4.88563290e-01],
       [ 1.73090413e+00, -1.46825592e+00, -1.39092222e+00, ...,
        -5.68885703e-01, -4.75622835e-01, -1.49077587e+00]])

In [20]:
# defining classifier
clf = LogisticRegression(penalty = 'l1', solver= 'saga', max_iter = 1000)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=10, 
    scoring= "accuracy",
    n_jobs=2,
)
rfecv.fit(scaled_data, y.label)

# printing results
print(f"Optimal number of features: {rfecv.n_features_}")
print(f"Ranking of features (Selected features are given rank 1): {rfecv.ranking_}")

Optimal number of features: 4
Ranking of features (Selected features are given rank 1): [ 2  7  6  5 18 17  1 13  9 11 19  4  3 20 21 15 10  1  1 14  1  8 16 12]


In [21]:
# visualising the result
print(X.columns)

optimal_features = []
for i in range(len(X.columns)):
    if((rfecv.ranking_[i])==1):
        optimal_features.append(X.columns[i])
print(optimal_features)

Index(['Unnamed: 0', 'avg_sen_len', 'std_sen_len', 'TTR', 'Root TTR',
       'Log TTR', 'Maas TTR', 'Msstr', 'Ma TTR', 'HDD', 'MTLD', 'MTLD MA',
       'MTLD MA Bi', 'VocD', 'YulesK', 'adverb/adjective', 'adverb/noun',
       'adverb/pronoun', 'adjective/verb', 'adjective/pronoun', 'noun/verb',
       'noun/pronoun', 'verb/pronoun', 'content/function'],
      dtype='object')
['Maas TTR', 'adverb/pronoun', 'adjective/verb', 'noun/verb']
