In [7]:
import pandas as pd 
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [8]:
dataset = pd.read_csv("c:/website_classification.csv" , low_memory=False) 
df = pd.DataFrame(dataset)
display(df)
print (df['Category'].value_counts(ascending=True))
print("Dataset size: " ,df.shape)

Unnamed: 0.1,Unnamed: 0,website_url,cleaned_website_text,Category
0,0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel
...,...,...,...,...
1404,1387,http://www.electroshops.com/,electroshops home theater decor interiors seat...,Business/Corporate
1405,1388,http://www.cleanridge.com/,clean ridge soap company clean ridge soap comp...,Business/Corporate
1406,1389,http://www.creativepetgifts.com/,home page pet crafts exquisitely piece handcut...,Business/Corporate
1407,1390,http://www.htmarket.com/,home theater marketplace home theater seating ...,Business/Corporate


Forums                              16
Social Networking and Messaging     83
Law and Government                  83
Photography                         91
Food                                91
News                                92
Computers and Technology            93
Health and Fitness                  96
Games                               98
E-Commerce                         102
Streaming Services                 103
Sports                             103
Travel                             106
Business/Corporate                 109
Education                          110
Name: Category, dtype: int64
Dataset size:  (1409, 4)


Dropping duplicate coloumn and missing value rows

In [9]:
df.drop('Unnamed: 0',axis=1,inplace=True)

print (df['Category'].value_counts(ascending=True))

df.replace([np.inf, -np.inf], np.nan, inplace=True) #replace infinity values with NaN
df.dropna(inplace=True) #dropping rows with missing values  


Forums                              16
Social Networking and Messaging     83
Law and Government                  83
Photography                         91
Food                                91
News                                92
Computers and Technology            93
Health and Fitness                  96
Games                               98
E-Commerce                         102
Streaming Services                 103
Sports                             103
Travel                             106
Business/Corporate                 109
Education                          110
Name: Category, dtype: int64
Dataset size after removal :  (1376, 3)


Extract features from text

In [10]:
x=df['cleaned_website_text']      
y=df['Category']  

x=CountVectorizer().fit_transform(x.apply(lambda x: np.str_(x)))
   
x=TfidfTransformer().fit_transform(x) 

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

Balancing dataset..

In [11]:
counter = Counter(y_train.shape) 
print('Before', counter) 
smtom = SMOTE() 
X_train_smtom, y_train_smtom = smtom.fit_resample (X_train, y_train)
counter = Counter(y_train_smtom) 
print('After', counter)
print("Dataset size after balance : " ,y_train_smtom.shape)

Before Counter({963: 1})
After Counter({'Law and Government': 77, 'Photography': 77, 'E-Commerce': 77, 'Travel': 77, 'Health and Fitness': 77, 'Forums': 77, 'Computers and Technology': 77, 'Streaming Services': 77, 'Food': 77, 'Games': 77, 'Social Networking and Messaging': 77, 'Sports': 77, 'News': 77, 'Education': 77, 'Business/Corporate': 77})
Dataset size after balance :  (1155,)


Hyperparameter tunning of naive bayes and applying the classifier 

In [17]:
NB_classifier = GaussianNB()
parameters={'var_smoothing': np.logspace(0,-9, num=100)}
grid=GridSearchCV(estimator = NB_classifier, param_grid =parameters, n_jobs=-1, verbose=-1)

# fitting the model for grid search
grid.fit(X_train_smtom.todense(),y_train_smtom)
# print best parameter after tuning
print(grid.best_params_)
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)




[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 15.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 28.4min finished


{'var_smoothing': 0.001873817422860383}
GaussianNB(var_smoothing=0.001873817422860383)


In [25]:
classifier = GaussianNB(var_smoothing=0.001873817422860383)
classifier.fit(X_train_smtom.todense(),y_train_smtom)
y_pred=classifier.predict(X_test.todense())
print(classification_report(y_test, y_pred))

                                 precision    recall  f1-score   support

             Business/Corporate       0.76      0.63      0.69        41
       Computers and Technology       0.76      0.66      0.70        29
                     E-Commerce       0.79      0.81      0.80        32
                      Education       0.91      0.64      0.75        33
                           Food       0.82      0.86      0.84        21
                         Forums       0.09      0.12      0.11         8
                          Games       0.70      0.87      0.78        30
             Health and Fitness       0.88      0.73      0.80        30
             Law and Government       0.79      0.97      0.87        31
                           News       0.68      0.65      0.67        26
                    Photography       0.84      0.80      0.82        20
Social Networking and Messaging       0.65      0.88      0.75        17
                         Sports       0.83      0.

Hyperparameter tunning of SVM and applying the classifier 

In [26]:

 
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],'gamma':[0.001, 0.01, 0.1, 1, 3, 5, 10, 20],
                'kernel': ['linear', 'rbf', 'poly']}
 
grid = GridSearchCV(SVC(), param_grid, cv=4, n_jobs=-1, verbose=1, scoring = "accuracy")
 
# fitting the model for grid search
grid.fit(X_train_smtom,y_train_smtom)


# print best parameter after tuning
print(grid.best_params_)
 
# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

Fitting 4 folds for each of 120 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed: 11.2min finished


{'C': 10, 'gamma': 0.001, 'kernel': 'linear'}
SVC(C=10, gamma=0.001, kernel='linear')


In [27]:
svc=SVC(C=10, gamma=0.001, kernel='linear')
svc.fit(X_train_smtom,y_train_smtom)
y_pred=svc.predict(X_test)

print(classification_report(y_test, y_pred))



                                 precision    recall  f1-score   support

             Business/Corporate       0.75      0.88      0.81        41
       Computers and Technology       0.69      0.93      0.79        29
                     E-Commerce       0.93      0.88      0.90        32
                      Education       0.89      1.00      0.94        33
                           Food       0.91      0.95      0.93        21
                         Forums       1.00      0.12      0.22         8
                          Games       0.93      0.93      0.93        30
             Health and Fitness       0.96      0.87      0.91        30
             Law and Government       0.94      0.94      0.94        31
                           News       0.92      0.88      0.90        26
                    Photography       1.00      0.90      0.95        20
Social Networking and Messaging       0.92      0.71      0.80        17
                         Sports       0.97      0.