In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LinearRegression, LogisticRegression

# Working on Training Data

In [None]:
# Read the raw training CSV once. `train_DF` stays untouched for the later NLP
# stage, while `train_df` is the working copy that the cleaning cells modify.
train_DF = pd.read_csv("../input/aid-escalating-internet-coverage/train.csv")
train_df = train_DF.copy()

In [None]:
train_df.head()

In [None]:
# Describing the training dataset
train_df.describe()

In [None]:
output_label = train_df['label']

In [None]:
output_label

In [None]:
# Checking for null values in the data frame
train_df.isna().sum()

In [None]:
# Checking for special values in the dataset
(train_df == '?').sum()

In [None]:
train_df['frame_based'].unique()

In [None]:
# frame_based was found to hold only the value '0' (see the unique() check above),
# so it carries no information and is removed.
train_df.drop(columns = 'frame_based', inplace = True)

In [None]:
train_df['is_news'].unique()

In [None]:
# is_news only takes the values '1' and '?', which is ambiguous, so the column is removed.
train_df.drop(columns = 'is_news', inplace = True)

In [None]:
# Drop the link column for now; web scraping may be applied to it later.
train_df.drop(columns = 'link', inplace = True)

In [None]:
# Remove the link_id column from the working frame.
train_df.drop(columns = 'link_id', inplace = True)

## Predicting alchemy_category column

In [None]:
train_df['alchemy_category'].unique()

In [None]:
# Integer-encode alchemy_category: each distinct value is replaced by its
# position in `categories` (order of first appearance, as returned by unique()).
categories = train_df['alchemy_category'].unique()

print(categories)

encoded = train_df['alchemy_category']
for code, name in enumerate(categories):
    encoded = encoded.replace(name, code)
train_df['alchemy_category'] = encoded

In [None]:
train_df['alchemy_category'].unique()

In [None]:
# Rows whose encoded category equals 4 are set aside to be imputed; all other
# rows are used to fit the imputation model.
# NOTE(review): 4 is assumed to be the code that enumerate() gave to the '?'
# placeholder above — confirm against the printed `categories`.
needs_category = train_df['alchemy_category'] == 4
train = train_df[~needs_category]
test = train_df[needs_category]

for part in (train, test):
    print(part.columns)
for part in (train, test):
    print(part.shape)

In [None]:
# Predictor columns used to impute alchemy_category; the same list is applied
# to the rows with a known category and to the rows to be imputed.
category_features = [
    'avg_link_size',
    'common_word_link_ratio_1',
    'common_word_link_ratio_2',
    'common_word_link_ratio_3',
    'common_word_link_ratio_4',
    'compression_ratio',
    'frame_tag_ratio',
    'has_domain_link',
    'html_ratio',
    'image_ratio',
]

train_X = train[category_features]
train_Y = train['alchemy_category']

test_X = test[category_features]
test_Y = test['alchemy_category']

In [None]:
naive_bayes_model = GaussianNB()

In [None]:
# Fit Gaussian Naive Bayes on the rows with a known category, then predict a
# category code for the rows that lack one.
naive_bayes_model.fit(train_X, train_Y)
fitting = naive_bayes_model  # fit() hands back the estimator itself
pred_Y = fitting.predict(test_X)

In [None]:
pred_Y

In [None]:
# `test` came from boolean-mask filtering, so writing a column into it with
# `test[...] = ...` raises pandas' SettingWithCopyWarning. `assign` returns a
# new frame with the column overwritten (its position is kept) and `test` is
# rebound to that frame.
test = test.assign(alchemy_category = pred_Y)

In [None]:
# Recombine the rows that already had a category with the rows that were just imputed.
# pd.concat keeps every row's original index label, so train_df is now ordered
# "known rows first, imputed rows last" rather than in file order.
frames = [train, test]

train_df = pd.concat(frames)
display(train_df)

In [None]:
train_df.shape

In [None]:
print(categories)

In [None]:
# Map the integer codes back to their category names (inverse of the encoding step).
decoded = train_df['alchemy_category']
for code, name in enumerate(categories):
    decoded = decoded.replace(code, name)
train_df['alchemy_category'] = decoded

In [None]:
train_df['alchemy_category'].unique()

## Applying One-Hot Encoding on alchemy_category column

In [None]:
train_df = pd.get_dummies(train_df, columns = ['alchemy_category'])

In [None]:
(train_df == '?').sum()

## Predicting alchemy_category_score column

In [None]:
train_df['alchemy_category_score']

In [None]:
# Separate rows with a real alchemy_category_score from rows holding the '?'
# placeholder: the former fit the imputation model, the latter get predicted.
score_missing = train_df['alchemy_category_score'] == '?'
train = train_df[~score_missing]
test = train_df[score_missing]

for part in (train, test):
    print(part.columns)
for part in (train, test):
    print(part.shape)

In [None]:
# Predictor columns used to impute alchemy_category_score (numeric page
# features plus the one-hot category columns).
score_features = [
    'avg_link_size',
    'common_word_link_ratio_1',
    'common_word_link_ratio_2',
    'common_word_link_ratio_3',
    'common_word_link_ratio_4',
    'compression_ratio',
    'frame_tag_ratio',
    'has_domain_link',
    'html_ratio',
    'image_ratio',
    'lengthy_link_domain',
    'link_word_score',
    'non_markup_alphanumeric_characters',
    'count_of_links',
    'number_of_words_in_url',
    'parametrized_link_ratio',
    'spelling_mistakes_ratio',
    'alchemy_category_arts_entertainment',
    'alchemy_category_business',
    'alchemy_category_computer_internet',
    'alchemy_category_culture_politics',
    'alchemy_category_gaming',
    'alchemy_category_health',
    'alchemy_category_law_crime',
    'alchemy_category_recreation',
    'alchemy_category_religion',
    'alchemy_category_science_technology',
    'alchemy_category_sports',
    'alchemy_category_unknown',
    'alchemy_category_weather',
]

train_X = train[score_features]
train_Y = train['alchemy_category_score']

test_X = test[score_features]
test_Y = test['alchemy_category_score']

In [None]:
print(train_X.shape)
print(test_X.shape)
print(train_Y.shape)
print(test_Y.shape)

In [None]:
# Fit a linear regression on the rows with a known score and predict the
# score for the rows where it is missing.
linear_regression = LinearRegression()
pred_Y = linear_regression.fit(train_X, train_Y).predict(test_X)

In [None]:
pred_Y

In [None]:
# `test` is a boolean-filtered slice of train_df; assigning into it with
# `test[...] = ...` triggers SettingWithCopyWarning. Build a new frame with the
# predicted scores in place of the '?' placeholders instead.
test = test.assign(alchemy_category_score = pred_Y)

In [None]:
# Recombine rows that had a score with the rows whose score was just predicted.
# Index labels are preserved by pd.concat, so the row order changes again here.
frames = [train, test]

train_df = pd.concat(frames)
display(train_df)

In [None]:
train_df['alchemy_category_score'] = pd.to_numeric(train_df['alchemy_category_score'], errors = 'coerce')

## Predicting news_front_page column

In [None]:
train_df.columns

In [None]:
train_df['news_front_page']

In [None]:
train_df['news_front_page'].unique()

In [None]:
# Separate rows with a known news_front_page value from rows holding the '?'
# placeholder, which will be filled in by a classifier.
front_page_missing = train_df['news_front_page'] == '?'
train = train_df[~front_page_missing]
test = train_df[front_page_missing]

for part in (train, test):
    print(part.columns)
for part in (train, test):
    print(part.shape)

In [None]:
# Predictor columns used to impute news_front_page; one shared list keeps the
# fitting rows and the rows to be predicted aligned.
front_page_features = [
    'alchemy_category_score',
    'avg_link_size',
    'common_word_link_ratio_1',
    'common_word_link_ratio_2',
    'common_word_link_ratio_3',
    'common_word_link_ratio_4',
    'compression_ratio',
    'embed_ratio',
    'frame_tag_ratio',
    'has_domain_link',
    'html_ratio',
    'image_ratio',
    'lengthy_link_domain',
    'link_word_score',
    'non_markup_alphanumeric_characters',
    'count_of_links',
    'number_of_words_in_url',
    'parametrized_link_ratio',
    'spelling_mistakes_ratio',
    'alchemy_category_arts_entertainment',
    'alchemy_category_business',
    'alchemy_category_computer_internet',
    'alchemy_category_culture_politics',
    'alchemy_category_gaming',
    'alchemy_category_health',
    'alchemy_category_law_crime',
    'alchemy_category_recreation',
    'alchemy_category_religion',
    'alchemy_category_science_technology',
    'alchemy_category_sports',
    'alchemy_category_unknown',
    'alchemy_category_weather',
]

train_X = train[front_page_features]
train_Y = train['news_front_page']

test_X = test[front_page_features]
test_Y = test['news_front_page']

In [None]:
print(train_X.shape)
print(test_X.shape)
print(train_Y.shape)
print(test_Y.shape)

In [None]:
# Fit a logistic regression on the rows with a known news_front_page value and
# predict it for the remaining rows. max_iter is raised to 10000 iterations.
model = LogisticRegression(max_iter = 10000)
pred_Y = model.fit(train_X, train_Y).predict(test_X)

In [None]:
pred_Y

In [None]:
# `test` is a boolean-filtered slice of train_df; `test[...] = ...` on it
# triggers SettingWithCopyWarning. `assign` returns a new frame with the
# predicted values in the existing news_front_page column.
test = test.assign(news_front_page = pred_Y)

In [None]:
# Recombine rows with a known news_front_page value and the rows just predicted.
# Index labels are preserved by pd.concat; the row order is not the file order.
frames = [train, test]

train_df = pd.concat(frames)
display(train_df)

In [None]:
train_df['news_front_page'] = pd.to_numeric(train_df['news_front_page'], errors = 'coerce')

In [None]:
train_df.dtypes

## Normalizing the training data

In [None]:
# Feature scaling: z-normalize every column (subtract the column mean, divide
# by the column standard deviation) except the target and the raw text.
not_scaled = {'label', 'page_description'}
for name in train_df.columns:
    if name in not_scaled:
        continue
    values = train_df[name]
    train_df[name] = (values - values.mean()) / values.std()

In [None]:
train_df.shape

# Performing operations on Testing Data

In [None]:
# Read the raw test CSV once. `test_DF` stays untouched for the later NLP
# stage, while `test_df` is the working copy that the cleaning cells modify.
test_DF = pd.read_csv("../input/aid-escalating-internet-coverage/test.csv")
test_df = test_DF.copy()

In [None]:
test_df.head()

In [None]:
# Describing the testing dataset
test_df.describe()

In [None]:
# Checking for null values in the data frame
test_df.isna().sum()

In [None]:
# Checking for special values in the dataset
(test_df == '?').sum()

In [None]:
test_df['frame_based'].unique()

In [None]:
# frame_based was found to hold only the value '0' (see the unique() check above),
# so it carries no information and is removed.
test_df.drop(columns = 'frame_based', inplace = True)

In [None]:
test_df['is_news'].unique()

In [None]:
# is_news only takes the values '1' and '?', which is ambiguous, so the column is removed.
test_df.drop(columns = 'is_news', inplace = True)

In [None]:
# Drop the link column for now; web scraping may be applied to it later.
test_df.drop(columns = 'link', inplace = True)

In [None]:
# Remove the link_id column from the working frame.
test_df.drop(columns = 'link_id', inplace = True)

## Predicting alchemy_category column

In [None]:
test_df['alchemy_category'].unique()

In [None]:
# Reuse the `categories` array built from the training data (it is not
# recomputed here), so a category gets the same integer code in both sets.

print(categories)

encoded = test_df['alchemy_category']
for code, name in enumerate(categories):
    encoded = encoded.replace(name, code)
test_df['alchemy_category'] = encoded

In [None]:
test_df['alchemy_category'].unique()

In [None]:
# Both `train` and `test` below are subsets of test_df: rows with a known
# category fit the imputation model, rows coded 4 get a predicted category.
# NOTE(review): 4 is assumed to be the code of the '?' placeholder in
# `categories` — confirm against the printed array.
needs_category = test_df['alchemy_category'] == 4
train = test_df[~needs_category]
test = test_df[needs_category]

for part in (train, test):
    print(part.columns)
for part in (train, test):
    print(part.shape)

In [None]:
# Predictor columns used to impute alchemy_category; the same list is applied
# to the rows with a known category and to the rows to be imputed.
category_features = [
    'avg_link_size',
    'common_word_link_ratio_1',
    'common_word_link_ratio_2',
    'common_word_link_ratio_3',
    'common_word_link_ratio_4',
    'compression_ratio',
    'frame_tag_ratio',
    'has_domain_link',
    'html_ratio',
    'image_ratio',
]

train_X = train[category_features]
train_Y = train['alchemy_category']

test_X = test[category_features]
test_Y = test['alchemy_category']

In [None]:
naive_bayes_model = GaussianNB()

In [None]:
# Fit Gaussian Naive Bayes on the rows with a known category, then predict a
# category code for the rows that lack one.
naive_bayes_model.fit(train_X, train_Y)
fitting = naive_bayes_model  # fit() hands back the estimator itself
pred_Y = fitting.predict(test_X)

In [None]:
pred_Y

In [None]:
# `test` came from boolean-mask filtering of test_df, so `test[...] = ...`
# raises pandas' SettingWithCopyWarning. `assign` returns a new frame with the
# column overwritten (its position is kept) and `test` is rebound to it.
test = test.assign(alchemy_category = pred_Y)

In [None]:
# Recombine the rows that already had a category with the rows that were just imputed.
# pd.concat keeps every row's original index label, so test_df is now ordered
# "known rows first, imputed rows last" rather than in file order.
frames = [train, test]

test_df = pd.concat(frames)
display(test_df)

In [None]:
test_df.shape

In [None]:
test_df['alchemy_category'].unique()

In [None]:
print(categories)

In [None]:
# Map the integer codes back to their category names (inverse of the encoding step).
decoded = test_df['alchemy_category']
for code, name in enumerate(categories):
    decoded = decoded.replace(code, name)
test_df['alchemy_category'] = decoded

In [None]:
test_df['alchemy_category'].unique()

## Applying One-hot encoding on alchemy_category column

In [None]:
test_df = pd.get_dummies(test_df, columns = ['alchemy_category'])

In [None]:
(test_df == '?').sum()

## Predicting alchemy_category_score column

In [None]:
test_df['alchemy_category_score']

In [None]:
# Separate rows with a real alchemy_category_score from rows holding the '?'
# placeholder; both subsets are taken from test_df.
score_missing = test_df['alchemy_category_score'] == '?'
train = test_df[~score_missing]
test = test_df[score_missing]

for part in (train, test):
    print(part.columns)
for part in (train, test):
    print(part.shape)

In [None]:
# Predictor columns used to impute alchemy_category_score on the test data
# (numeric page features plus the one-hot category columns listed here).
score_features = [
    'avg_link_size',
    'common_word_link_ratio_1',
    'common_word_link_ratio_2',
    'common_word_link_ratio_3',
    'common_word_link_ratio_4',
    'compression_ratio',
    'frame_tag_ratio',
    'has_domain_link',
    'html_ratio',
    'image_ratio',
    'lengthy_link_domain',
    'link_word_score',
    'non_markup_alphanumeric_characters',
    'count_of_links',
    'number_of_words_in_url',
    'parametrized_link_ratio',
    'spelling_mistakes_ratio',
    'alchemy_category_arts_entertainment',
    'alchemy_category_computer_internet',
    'alchemy_category_culture_politics',
    'alchemy_category_gaming',
    'alchemy_category_health',
    'alchemy_category_law_crime',
    'alchemy_category_recreation',
    'alchemy_category_religion',
    'alchemy_category_science_technology',
    'alchemy_category_sports',
]

train_X = train[score_features]
train_Y = train['alchemy_category_score']

test_X = test[score_features]
test_Y = test['alchemy_category_score']

In [None]:
print(train_X.shape)
print(test_X.shape)
print(train_Y.shape)
print(test_Y.shape)

In [None]:
# Fit a linear regression on the rows with a known score and predict the
# score for the rows where it is missing.
linear_regression = LinearRegression()
pred_Y = linear_regression.fit(train_X, train_Y).predict(test_X)

In [None]:
pred_Y

In [None]:
# `test` is a boolean-filtered slice of test_df; assigning into it with
# `test[...] = ...` triggers SettingWithCopyWarning. Build a new frame with the
# predicted scores in place of the '?' placeholders instead.
test = test.assign(alchemy_category_score = pred_Y)

In [None]:
# Recombine rows that had a score with the rows whose score was just predicted.
# Index labels are preserved by pd.concat, so the row order changes again here.
frames = [train, test]

test_df = pd.concat(frames)
display(test_df)

In [None]:
test_df['alchemy_category_score'] = pd.to_numeric(test_df['alchemy_category_score'], errors = 'coerce')

## Predicting news_front_page column

In [None]:
test_df.columns

In [None]:
test_df['news_front_page']

In [None]:
test_df['news_front_page'].unique()

In [None]:
# Separate rows with a known news_front_page value from rows holding the '?'
# placeholder; both subsets are taken from test_df.
front_page_missing = test_df['news_front_page'] == '?'
train = test_df[~front_page_missing]
test = test_df[front_page_missing]

for part in (train, test):
    print(part.columns)
for part in (train, test):
    print(part.shape)

In [None]:
# Predictor columns used to impute news_front_page on the test data; one
# shared list keeps the fitting rows and the rows to be predicted aligned.
front_page_features = [
    'alchemy_category_score',
    'avg_link_size',
    'common_word_link_ratio_1',
    'common_word_link_ratio_2',
    'common_word_link_ratio_3',
    'common_word_link_ratio_4',
    'compression_ratio',
    'embed_ratio',
    'frame_tag_ratio',
    'has_domain_link',
    'html_ratio',
    'image_ratio',
    'lengthy_link_domain',
    'link_word_score',
    'non_markup_alphanumeric_characters',
    'count_of_links',
    'number_of_words_in_url',
    'parametrized_link_ratio',
    'spelling_mistakes_ratio',
    'alchemy_category_arts_entertainment',
    'alchemy_category_computer_internet',
    'alchemy_category_culture_politics',
    'alchemy_category_gaming',
    'alchemy_category_health',
    'alchemy_category_law_crime',
    'alchemy_category_recreation',
    'alchemy_category_religion',
    'alchemy_category_science_technology',
    'alchemy_category_sports',
]

train_X = train[front_page_features]
train_Y = train['news_front_page']

test_X = test[front_page_features]
test_Y = test['news_front_page']

In [None]:
print(train_X.shape)
print(test_X.shape)
print(train_Y.shape)
print(test_Y.shape)

In [None]:
# Fit a logistic regression on the rows with a known news_front_page value and
# predict it for the remaining rows. max_iter is raised to 10000 iterations.
model = LogisticRegression(max_iter = 10000)
pred_Y = model.fit(train_X, train_Y).predict(test_X)

In [None]:
pred_Y

In [None]:
# `test` is a boolean-filtered slice of test_df; `test[...] = ...` on it
# triggers SettingWithCopyWarning. `assign` returns a new frame with the
# predicted values in the existing news_front_page column.
test = test.assign(news_front_page = pred_Y)

In [None]:
# Recombine rows with a known news_front_page value and the rows just predicted.
# Index labels are preserved by pd.concat; the row order is not the file order.
frames = [train, test]

test_df = pd.concat(frames)
display(test_df)

In [None]:
test_df['news_front_page'] = pd.to_numeric(test_df['news_front_page'], errors = 'coerce')

In [None]:
test_df.dtypes

## Normalizing the testing data

In [None]:
# Feature scaling: z-normalize every column (subtract the column mean, divide
# by the column standard deviation) except 'label' and the raw text column.
# The mean and standard deviation are computed from test_df itself.
not_scaled = {'label', 'page_description'}
for name in test_df.columns:
    if name in not_scaled:
        continue
    values = test_df[name]
    test_df[name] = (values - values.mean()) / values.std()

In [None]:
test_df.shape

# Combining training and testing data for NLP

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
train_df.columns

In [None]:
test_df.columns

In [None]:
# Keep the target aside before it is removed from the feature frame.
train_df_output = train_df['label']

# Remove the target and the two extra one-hot columns so the training frame's
# columns line up with the testing frame for the later combination.
train_df.drop(
    columns = ['label', 'alchemy_category_unknown', 'alchemy_category_weather'],
    inplace = True,
)

In [None]:
# Sanity check: count the index labels i for which the label kept in
# train_df_output equals the label read from the raw file (output_label).
# Both Series are indexed by label here, not by position.
count = sum(
    1
    for i in range(len(train_df_output))
    if train_df_output[i] == output_label[i]
)

count

In [None]:
# train_df.drop(axis = 'columns', labels = ['label'], inplace = True)

In [None]:
print(train_df.shape)
print(test_df.shape)

In [None]:
train_df.columns

In [None]:
test_df.columns

## Combining training and testing data for NLP

In [None]:
# frames = [train_df, test_df]
# The untouched raw copies (train_DF / test_DF) are concatenated here rather than
# the cleaned train_df / test_df, so the NLP stage below starts from the raw
# columns. Training rows come first, test rows second.
frames = [train_DF, test_DF]

dataFrame = pd.concat(frames)

In [None]:
dataFrame.shape

# Applying NLP on page_description column in DataFrame

In [None]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
stop_words = stopwords.words('english')
# print(stop_words)

In [None]:
# Take 'not' out of the stop-word list so it is NOT filtered from the text
# (presumably to keep negations available to the classifier — confirm intent).
stop_words.remove('not')

In [None]:
# Getting the page_description column
sentences = dataFrame['page_description'].tolist()

In [None]:
type(sentences)

In [None]:
# Cleaning the data: every character that is not an ASCII letter becomes a
# space, and the result is lowercased.
import re
non_letters = re.compile('[^a-zA-Z]')
output = [non_letters.sub(' ', sen).lower() for sen in sentences]
sentences = output

In [None]:
# Lemmatization...
# Fetch the WordNet resources and create the lemmatizer used on each token below.
# NOTE(review): stopwords.words('english') and nltk.word_tokenize() are also used
# in this notebook, but no cell visible here downloads the NLTK 'stopwords' or
# tokenizer ('punkt') resources — confirm they are pre-installed in the runtime.

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

In [None]:
# Build one cleaned string per page description: tokenize, drop the literal
# tokens 'url', 'title', 'body', 'label' and all stop words, lemmatize what is
# left, and join the tokens back together.
#
# The exclusion set is built once instead of calling set(stop_words) for every
# token, and no variable named `str` is created, so the builtin `str` stays
# usable in later cells.
excluded_tokens = set(stop_words) | {'url', 'title', 'body', 'label'}

corpus = []
for sentence in sentences:
    kept_tokens = [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(sentence)
        if word not in excluded_tokens
    ]
    # Every token is prefixed with a single space, so a non-empty entry
    # starts with a leading space and an entry with no tokens is ''.
    corpus.append(''.join(' ' + token for token in kept_tokens))

In [None]:
corpus[1]

In [None]:
# Applying TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# tfidf = TfidfVectorizer(ngram_range = (1, 3), max_features = 100)
tfidf = TfidfVectorizer(ngram_range = (1, 5), max_features = 40000)
## max_features --> it means that many top features by tfidf will be selected

In [None]:
tfidf_vector = tfidf.fit_transform(corpus).toarray()

In [None]:
tfidf_vector.shape

## Combining the TF-IDF vector and DataFrame

In [None]:
column_names = tfidf.get_feature_names_out()

In [None]:
type(column_names)

In [None]:
print(len(column_names))
print(tfidf_vector.shape)

In [None]:
tfidf_df = pd.DataFrame(data = tfidf_vector, columns = column_names)

In [None]:
tfidf_df

In [None]:
# if 'label' in tfidf_df.columns:
#     tfidf_df.drop(axis = 'columns', labels = 'label', inplace = True)

# if 'label' in tfidf_df.columns:
#     print('Exists')
# else:
#     print('Not Exists')

In [None]:
tfidf_df.shape

In [None]:
dataFrame.shape

In [None]:
dataFrame.reset_index(drop = True, inplace = True)

In [None]:
# dataFrame = pd.concat([dataFrame, tfidf_df], axis = 1)

In [None]:
# dataFrame.shape

In [None]:
# dataFrame.drop(axis = 'columns', labels = 'page_description', inplace = True)

In [None]:
# The TF-IDF matrix was built from the concatenation [train_DF, test_DF], so
# its first len(train_DF) rows are the training rows and the rest are test rows.
# The boundary is derived from the data instead of a hard-coded row count, so
# the split stays correct if the size of the training file changes.
n_train_rows = len(train_DF)
train_df = tfidf_df.iloc[: n_train_rows, : ]
test_df = tfidf_df.iloc[n_train_rows : , : ]

In [None]:
print(train_df.shape)
print(test_df.shape)

In [None]:
train_df_output.shape

In [None]:
train_df_output

In [None]:
train_df.info()

# Applying Support Vector Machine Algorithm

In [None]:
# RBF-kernel support vector classifier with every hyper-parameter spelled out.
# NOTE(review): these values look like scikit-learn's defaults — confirm
# against the installed version before trimming any of them.
svm_model = svm.SVC(
    C = 1.0,
    kernel = 'rbf',
    degree = 3,
    coef0 = 0.0,
    shrinking = True,
    probability = False,
    tol = 0.001,
    cache_size = 200,
    class_weight = None,
    verbose = False,
    max_iter = -1,
    random_state = None,
)

In [None]:
# Fit the SVC configured above on the TF-IDF training rows and predict the
# test rows. This cell previously called `model`, the LogisticRegression left
# over from the news_front_page imputation, so the SVM was never used even
# though the section and the submission file are named after it.
pred_Y = svm_model.fit(train_df, output_label).predict(test_df)

In [None]:
pred_Y.shape

In [None]:
pred_Y

In [None]:
main_df = pd.read_csv("../input/aid-escalating-internet-coverage/test.csv")

In [None]:
# Keep only the column at positional index 1, as a one-column DataFrame.
# NOTE(review): the result is later labelled 'link_id' — this assumes link_id
# is the second column of test.csv; confirm against the file.
main_df = main_df.iloc[ : , 1 : 2]

In [None]:
main_df = main_df.to_numpy()

In [None]:
pred_Y = pred_Y.reshape((pred_Y.shape[0], 1))

In [None]:
pred_Y.shape

In [None]:
output_df = np.concatenate((main_df, pred_Y), axis = 1)

In [None]:
output_df.shape

In [None]:
output_df = pd.DataFrame(output_df, columns = ['link_id', 'label'])

In [None]:
output_df['link_id'] = output_df['link_id'].astype(int)

In [None]:
output_df

In [None]:
output_df.to_csv("./submission_svm.csv", index = False)