pip install pandas
pip install scikit-learn
pip install xgboost
pip install tensorflow
pip install beautifulsoup

## Data Preprocessing

In [2]:
import pandas as pd
import csv 

# Adding Positive Words
def add_text(path):
    file = open(path,'r',encoding= "ISO-8859-1")
    read = file.readlines()
    list1 = []
    for i in read:
        s = i.split("|")
        if(len([i]) == 1 and len(s) == 1):
            i = i.strip()
            a = i.replace("\n","")
            a = i.replace("|","")
            list1.append(a)
        elif (len(s) > 1):
            a = i.replace("\n","")
            a = i.replace("|","")
            a = a.split(" ")
            for z in a:
                list1.append(z)
            for i in list1:
                if i == '\n':
                    list1.remove(i)
            for i in list1:
                if i == '':
                    list1.remove(i)
    file.close()
    return list1

positive_words = add_text('positive-words.txt')
negative_words = add_text('negative-words.txt')

stop_words1 = add_text('StopWords_Auditor.txt')
stop_words2 = add_text('StopWords_Currencies.txt')
stop_words3 = add_text('StopWords_DatesandNumbers.txt')
stop_words4 = add_text('StopWords_Generic.txt')
stop_words5 = add_text('StopWords_GenericLong.txt')
stop_words6 = add_text('StopWords_Geographic.txt')
stop_words7 = add_text('StopWords_Names.txt')

stop_words1.extend(stop_words2)
stop_words1.extend(stop_words3)
stop_words1.extend(stop_words4)
stop_words1.extend(stop_words5)
stop_words1.extend(stop_words6)
stop_words1.extend(stop_words7)

print(len(positive_words),len(negative_words),len(stop_words1))

for i in positive_words:
    csvfile = open("words.csv","a")
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow([i,0])
    csvfile.close()

for i in negative_words:
    csvfile = open("words.csv","a")
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow([i,1])
    csvfile.close()

for i in stop_words1:
    csvfile = open("words.csv","a")
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow([i,2])
    csvfile.close()

data_words = pd.read_csv('words.csv')

print(data_words.head())

2006 4783 14241
           a+  0
0      abound  0
1     abounds  0
2   abundance  0
3    abundant  0
4  accessable  0


## Modeling

In [4]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import StandardScaler
import xgboost as xg
from sklearn.metrics import accuracy_score, classification_report

# 0 - Positive
# 1 - Negative 
# 2 - Stop

data = pd.read_csv('words.csv')
data.columns = ["Text","Label"]

data["Text"] = data["Text"].apply(lambda x: str(x).lower())

vocab = 20000

tokenizer = Tokenizer(num_words=vocab, split=" ")
tokenizer.fit_on_texts(data["Text"].values)
X = tokenizer.texts_to_sequences(data["Text"].values)
X = pad_sequences(X,maxlen=1)

Y = data["Label"].values

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=42
)

print(X_train)

model = xg.XGBClassifier(
    objective='multi:softprob',num_class=3,n_estimators = 600,max_depth = 5)

model.fit(X_train, Y_train)

#Making predictions on the test set
predictions = model.predict(X_test)

#Calculating accuracy
accuracy = accuracy_score(Y_test, predictions)

print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(Y_test, predictions))

[[12873]
 [17798]
 [ 8037]
 ...
 [18083]
 [ 2253]
 [15221]]
Accuracy: 0.9808368996671422

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95      2007
           1       0.97      0.98      0.98      4775
           2       0.99      0.99      0.99     14248

    accuracy                           0.98     21030
   macro avg       0.97      0.97      0.97     21030
weighted avg       0.98      0.98      0.98     21030



## Url Text extraction

In [14]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
from functools import reduce

def remove_digits(lst):
    return [reduce(lambda x, y: x+y, filter(lambda x: not x.isdigit(), s), '') for s in lst]

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True
    

def get_main_text(u):
    main_t2 = []
    main_text_final = []
    main_t = ""
    
    response = requests.get(u)
    html_content = response.content
    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    
    texts = soup.findAll(string=True)
    visible_texts = filter(tag_visible, texts)  
    main_t = main_t + u" ".join(t.strip() for t in visible_texts)
    
    main_t = main_t.split(".")
    main_t = list(filter(None, main_t))
    for i in main_t:
        a = i.strip().replace(",","").replace("?","").replace(":","").replace("+","").replace("/","").replace("!","")
        main_t2.append(a)
    
    for i in main_t2:
        main = []
        main = i.split(" ")
        for i in main:
            main_text_final.append(i)
    
    main_text_final = list(filter(None, main_text_final))
    count = 0
    for i in main_text_final:
        if(len(i)<2):
            main_text_final.remove(i)
    
    main_text_final=remove_digits(main_text_final)
    main_text_final = list(filter(None, main_text_final))

    return main_text_final,main_t2


## Score Calculation

In [68]:
import numpy as np
import re

def syllable_count(word):
    word = word.lower()
    count = 0
    vowels = "aeiouy"
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith("e"):
        count -= 1
    if count == 0:
        count += 1
    return count

def predictions(i):
    sample = {}
    sample.update({'lines':[i]})
    sample = pd.DataFrame(sample)
    
    x_d = tokenizer.texts_to_sequences(sample['lines'].values)
    x_d = pad_sequences(x_d,maxlen=1)


    predictions = model.predict(x_d)
    for i in predictions:
        if i == 0:
            predictions = 0
        elif i == 1:
            predictions = 1
        elif i == 2:
            predictions = 2
            
    return predictions



def get_score(main_text_final,main_t2):
    positive_score = 0
    negative_score = 0
    complex_word_count = 0
    word_count = []
    sum_char = 0
    avg_word_length = 0
    sum_syl = 0
    
    for i in main_text_final:
        sum_char += len(i)
        
    for i in main_text_final:
        sum_syl += syllable_count(i)
        
    for i in main_text_final:
        predict = predictions(i)
        if predict == 0:
            word_count.append(i)
            positive_score +=1
        elif predict == 1:
            word_count.append(i)
            negative_score -= 1
    for i in main_text_final:
        if(syllable_count(i) > 2):
            complex_word_count += 1
    
    for i in main_text_final:
        pronounRegex = re.compile(r'\b(I|we|my|ours|(?-i:us))\b',re.I)
        pronouns = pronounRegex.findall(i)
        
    syllable_coun = sum_syl/len(main_text_final)
    pronouns_num = len(pronouns)
    polarity_score = (positive_score - abs(negative_score))/((positive_score + abs(negative_score)) + 0.000001)
    subjectivity_score = (positive_score + abs(negative_score))/ ((len(word_count)) + 0.000001)
    avg_word_length = sum_char / len(main_text_final)
    percentage_complex_words = complex_word_count/len(main_text_final)
    sum_w = 0
    for i in main_t2:
        text_t = i.split(" ")
        sum_w += len(text_t)
    avg_words_per_sentence = sum_w/len(main_t2)
    fog = 0.4 * (avg_words_per_sentence+ percentage_complex_words)
    
    
    return [positive_score,negative_score,polarity_score,subjectivity_score,avg_words_per_sentence,percentage_complex_words,fog,avg_words_per_sentence,complex_word_count,len(word_count),syllable_coun,pronouns_num,avg_word_length]


## Output in form of CSV File

In [82]:
csvfile = open("Input.csv","r")
csvreader = csv.reader(csvfile)

csvout = open("Output.csv","a")
csvwriter = csv.writer(csvout)
csvwriter.writerow(['URL_ID','URL','POSITIVE SCORE','NEGATIVE SCORE','POLARITY SCORE','SUBJECTIVITY SCORE','AVG SENTENCE LENGTH','PERCENTAGE OF COMPLEX WORDS','FOG INDEX','AVERAGE NUMBER OF WORDS PER SENTENCE','COMPLEX WORD COUNT','WORD COUNT','SYLLABLE PER WORD','PERSONAL PRONOUN','AVG WORD LENGTH'])

names = []
link = []
for lines in csvreader:
    names.append(lines[0])
    link.append(lines[1])

names = names[1:]
link = link[1:]


for i in range(0,len(link)):
    print(i)
    x,y = get_main_text(link[i])
    csvwriter.writerow([names[i],link[i],get_score(x,y)[0],get_score(x,y)[1],get_score(x,y)[2],get_score(x,y)[3],get_score(x,y)[4],get_score(x,y)[5],get_score(x,y)[6],get_score(x,y)[7],get_score(x,y)[8],get_score(x,y)[9],get_score(x,y)[10],get_score(x,y)[11],get_score(x,y)[12]])

csvfile.close()
csvout.close()

data = pd.read_csv('Output.csv',on_bad_lines='skip')
data.head()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146


Unnamed: 0,URL_ID,URL,POSITIVE SCORE,NEGATIVE SCORE,POLARITY SCORE,SUBJECTIVITY SCORE,AVG SENTENCE LENGTH,PERCENTAGE OF COMPLEX WORDS,FOG INDEX,AVERAGE NUMBER OF WORDS PER SENTENCE,COMPLEX WORD COUNT,WORD COUNT,SYLLABLE PER WORD,PERSONAL PRONOUN,AVG WORD LENGTH
0,bctech2011,https://insights.blackcoffer.com/ml-and-ai-bas...,182,-58,0.516667,1.0,21.584112,0.36786,8.780789,21.584112,1243,240,2.20657,0,6.504291
1,bctech2012,https://insights.blackcoffer.com/streamlined-i...,66,-34,0.32,1.0,36.830769,0.349415,14.872074,36.830769,478,100,2.205409,0,6.590643
2,bctech2013,https://insights.blackcoffer.com/efficient-dat...,66,-58,0.064516,1.0,45.87037,0.316973,18.474938,45.87037,465,124,2.145194,0,6.426721
3,bctech2014,https://insights.blackcoffer.com/effective-man...,66,-47,0.168142,1.0,35.268657,0.340237,14.243557,35.268657,460,113,2.16642,0,6.467456
4,bctech2015,https://insights.blackcoffer.com/streamlined-t...,61,-42,0.184466,1.0,56.340909,0.325982,22.666756,56.340909,473,103,2.128877,0,6.274983
