In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("OnlineNewsPopularity.csv")
data.shape

(39644, 61)

In [3]:
data.isna().any()
data.drop(columns='url',inplace=True)

In [4]:
def popular(shares):
    sum = 0
    popular_list = []
    for i in shares:
        sum+=i
    avg = sum/len(shares)
    for i in shares:
        if i >= avg:
            popular_list.append(True)
        else:
            popular_list.append(False)
    return popular_list

In [5]:
shares = data[' shares']
popularity = popular(shares)
data['Popularity'] = popularity
col = data.columns

In [6]:
pearson_corr = data.corr(method="pearson")['Popularity'][:-1]
kendall_corr = data.corr(method="kendall")['Popularity'][:-1]
spearman_corr = data.corr(method="spearman")["Popularity"][:-1]

In [7]:
nrows,ncols = data.shape
attr_selection = [] 
for i in range(ncols-2):
    if pearson_corr[i] > 0 and spearman_corr[i] > 0 and kendall_corr[i] > 0:
        attr_selection.append(data.columns[i+1])
attr_selection

[' n_tokens_title',
 ' num_self_hrefs',
 ' num_imgs',
 ' num_videos',
 ' average_token_length',
 ' data_channel_is_lifestyle',
 ' data_channel_is_entertainment',
 ' data_channel_is_tech',
 ' data_channel_is_world',
 ' kw_max_min',
 ' kw_avg_min',
 ' kw_min_max',
 ' kw_max_max',
 ' kw_min_avg',
 ' kw_max_avg',
 ' kw_avg_avg',
 ' self_reference_min_shares',
 ' self_reference_max_shares',
 ' self_reference_avg_sharess',
 ' weekday_is_monday',
 ' weekday_is_sunday',
 ' is_weekend',
 ' LDA_00',
 ' LDA_04',
 ' global_sentiment_polarity',
 ' global_rate_positive_words',
 ' global_rate_negative_words',
 ' rate_positive_words',
 ' min_positive_polarity',
 ' avg_negative_polarity',
 ' title_sentiment_polarity',
 ' abs_title_subjectivity',
 ' shares']

In [8]:
data[' shares'] = (data[' shares'] > 1400 ).astype(int)
X = data[attr_selection]
X = X.drop(columns=' shares')
X.head()

Unnamed: 0,n_tokens_title,num_self_hrefs,num_imgs,num_videos,average_token_length,data_channel_is_lifestyle,data_channel_is_entertainment,data_channel_is_tech,data_channel_is_world,kw_max_min,...,LDA_00,LDA_04,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,min_positive_polarity,avg_negative_polarity,title_sentiment_polarity,abs_title_subjectivity
0,12.0,2.0,1.0,0.0,4.680365,0.0,1.0,0.0,0.0,0.0,...,0.500331,0.040123,0.092562,0.045662,0.013699,0.769231,0.1,-0.35,-0.1875,0.0
1,9.0,1.0,1.0,0.0,4.913725,0.0,0.0,0.0,0.0,0.0,...,0.799756,0.050001,0.148948,0.043137,0.015686,0.733333,0.033333,-0.11875,0.0,0.5
2,9.0,1.0,1.0,0.0,4.393365,0.0,0.0,0.0,0.0,0.0,...,0.217792,0.682188,0.323333,0.056872,0.009479,0.857143,0.1,-0.466667,0.0,0.5
3,9.0,0.0,1.0,0.0,4.404896,0.0,1.0,0.0,0.0,0.0,...,0.028573,0.028572,0.100705,0.041431,0.020716,0.666667,0.136364,-0.369697,0.0,0.5
4,13.0,19.0,20.0,0.0,4.682836,0.0,0.0,1.0,0.0,0.0,...,0.028633,0.885427,0.281003,0.074627,0.012127,0.860215,0.033333,-0.220192,0.136364,0.045455


In [9]:
y = data[' shares']
y.head()

0    0
1    0
2    1
3    0
4    0
Name:  shares, dtype: int32

In [10]:
import math
import random
import statistics
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

In [11]:
def init_population(p_size, c, top_num):
    population = []
    for i in range(p_size):
        individual = [0]*c
        j = 0
        while(j<top_num):
            p = random.uniform(0,1)
            pos = random.randrange(c)
            if p >= 0.5 and individual[pos] == 0:
                individual[pos] = 1
                j = j+1
        if sum(individual) == 0:
            pos = random.randrange(c)
            individual[pos] = 1
        population.append(individual)
    #print("Population is ", population)
    #print(population)
    return population

In [12]:
def calculate_fitness(features, target):
    model = MLPClassifier()
    scores = cross_val_score(model, features, target, scoring='f1_macro',n_jobs=-1,cv=10)
    #print(scores)
    #print(scores.mean())
    return scores.mean()

In [13]:
def get_fitness(population, data, target):
    fit_values = []
    for individual in population:
        #print("Individual is ",individual)
        df = data
        i=0
        for column in data:
            if individual[i] == 0:
                df = df.drop(column, axis=1)
            i = i+1
        features = df
        #print(features)
        individual_fitness = calculate_fitness(features, target)
        fit_values.append(individual_fitness)
    return fit_values

In [14]:
def select_parents(population, fit_values):
    parents = []
    total = sum(fit_values)
    #print(total)
    norm_fit_values = [x/total for x in fit_values]
    #print(norm_fit_values)
    cumulative_fitness = []
    i = 0
    for norm in norm_fit_values:
        i += norm
        cumulative_fitness.append(norm)
    #print(cumulative_fitness)
    p_size = len(population)
    #print(p_size)
    for count in range(p_size):
        random_num = random.uniform(0,1)
        individual_num = 0
        for score in cumulative_fitness:
            if random_num <= score:
                parents.append(population[individual_num])
                break
            individual_num+=1
    #print(parents)
    return parents

In [15]:
def crossover(parents, probability):
    random.shuffle(parents)
    num_of_pairs = round(len(parents)*probability/2)
    #print(num_of_pairs)
    chromosome_len = len(parents[0])
    crossover_population = []
    for i in range(num_of_pairs):
        length = len(parents)
        parent1_index = random.randrange(length)
        parent2_index = random.randrange(length)
        while(parent1_index == parent2_index):
            parent2_index = random.randrange(length)
        start = random.randrange(chromosome_len)
        end = random.randrange(chromosome_len)
        if start > end:
            start,end = end, start
        parent1 = parents[parent1_index]
        parent2 = parents[parent2_index]
        child1 =  parent1[0:start] 
        child1.extend(parent2[start:end])
        child1.extend(parent1[end:])
        child2 =  parent2[0:start]
        child2.extend(parent1[start:end])
        child2.extend(parent2[end:])
        parents.remove(parent1)
        parents.remove(parent2)
        crossover_population.append(child1)
        crossover_population.append(child2)
    if (len(parents) > 0):
        for remaining_parents in parents:
            crossover_population.append(remaining_parents)
    return crossover_population

In [16]:
def mutation(crossover_population):
    for individual in crossover_population:
        i_1 = random.randrange(len(individual))
        i_2 = random.randrange(len(individual))
        while (i_2 == i_1) and individual[i_1] != individual[i_2]:
            i_2 = random.randrange(len(individual))
        t = individual[i_1]
        individual[i_1] = individual[i_2]
        individual[i_2] = t
    return crossover_population

In [17]:
def genetic_algo(data,features,target,population_size,tol_level,top_number):
    c = data.shape[1]
    population = init_population(population_size, c, top_number)
    fitness_values = get_fitness(population, data, target)
    parents = select_parents(population, fitness_values)
    #print(parents)
    crossover_population = crossover(parents, 0.8)
    population = crossover_population
    p = random.uniform(0,1)
    if (p <= 0.001):
        mutated_population = mutation(crossover_population)
        population = mutated_population
    fitness_values = get_fitness(population, data, target)
    var_of_population = np.var(fitness_values)
    #print("Variance is ", var_of_population)
    count_of_gen = 1
    while(var_of_population > tol_level):
        print('Generations: ', count_of_gen)
        parents = select_parents(population, fitness_values)
        crossover_population = crossover(parents, 0.8)
        population = crossover_population
        p = random.uniform(0,1)
        if (p <= 0.001):
            mutated_population = mutation(crossover_population)
            population = mutated_population
        fitness_values = get_fitness(population, data, target)
        var_of_population = np.var(fitness_values)
        #print("Variance is ", var_of_population)
        count_of_gen+=1
    best_features = []
    best_f1_score = 0
    optimal_fitness = sum(fitness_values)/ len(fitness_values)
    print("Average fitness is: ", optimal_fitness)
    for i,fit_value in enumerate(fitness_values):
        err = abs((fit_value - optimal_fitness)/optimal_fitness)
        if err <= 0.01:
            best_features = population[i]
            best_f1_score = fitness_values
    print(best_features)
    return best_features,best_f1_score

In [18]:
top_features, best_f1_score = genetic_algo(X,X,y,20, 0.000005, 25)
i = 0
list_of_features = []
for i in range(len(top_features)):
    if(top_features[i]==1):
        list_of_features.append(X.columns[i])

0.436434090429637
0.43227007537656964
0.44707101265874805
0.39076230968971293
0.5442987694794518
0.38310782338111027
0.4241705865456512
0.4099556429775548
0.5231011512597057
0.43038635800373265
0.4847494048771921
0.43855393627866307
0.38361128819819246
0.41399469669948796
0.5222670475850478
0.502138065632829
0.41395008100300873
0.4094057641280056
0.4731253237322939
0.4771456559609816
0.3855221878193918
0.4288107198728398
0.378699481245144
0.4783556494991384
Generations:  1
0.4987895059339742
0.5361748167430239
0.4232406182497428
Generations:  2
0.5459182346421575
0.4996196132849514
Generations:  3
0.5495793801392604
0.48035734402602975
Generations:  4
0.5112642511538608
Average fitness is:  0.5112642511538608
[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1]


In [19]:
print(top_features)
print(list_of_features)
print(best_f1_score)

[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1]
[' n_tokens_title', ' num_self_hrefs', ' num_imgs', ' num_videos', ' average_token_length', ' data_channel_is_entertainment', ' data_channel_is_tech', ' data_channel_is_world', ' kw_max_min', ' kw_avg_min', ' kw_min_max', ' kw_min_avg', ' kw_max_avg', ' kw_avg_avg', ' self_reference_max_shares', ' weekday_is_monday', ' weekday_is_sunday', ' LDA_00', ' LDA_04', ' global_sentiment_polarity', ' global_rate_positive_words', ' global_rate_negative_words', ' title_sentiment_polarity', ' abs_title_subjectivity']
[0.5112642511538608]


In [20]:
print(len(list_of_features))

24
