In [1]:
import json 
import ijson
import pandas as pd
import re, os
import random
from collections import Counter

In [2]:
country_code_df= pd.read_csv('database/country_code.csv' ,sep = ',')
country_code = country_code_df.values.tolist()

domain_df= pd.read_csv('database/domain.csv' ,sep = '.')
domain_code = domain_df.values.tolist()

flags_df= pd.read_csv('database/flags_new.csv' ,sep = ',')
flags = flags_df.values.tolist()

language_code_df= pd.read_csv('database/language_code.csv' ,sep = ':')
language_code = language_code_df.values.tolist()
    
cities_df= pd.read_csv('database/worldcities.csv' ,sep = ',')
cities = cities_df.values.tolist()

country_to_code = [row[1] for row in country_code]
country_class = { name: k+1 for k,name in enumerate(country_to_code) }#create class for each country code


In [3]:
### all the functiones needed for this project
def tokenize(s): return re.sub('([^A-Za-z0-9 ]+)', ' \\1 ', s).split()  # add space around anything not alphanum
def sanify(s): return s.replace(':',';').replace('|','/')

def get_key(dicti,val):
    for key, value in dicti.items():
         if val == value:
             return key
 
    return "There is no such Key"

def writeToVWFile(filename, examples):
    with open(filename, 'w') as h:
        for ex in examples:
            h.write(ex.strip()+'\n')
def pop_2(tokens,n=2):# 
    pop_list =[]
    for ii in tokens:
        if len(ii)<=n:
            pop_list.append(ii)
    for ii in pop_list:
        tokens.remove(ii)
    return tokens
    
def check_flag(tokens):
    labeled = False
    label = None
    #tokens = pop_2(tokens)
    for token in tokens: #check for countries with one part
        if token in flags_df.values:
            index_row = [flags.index(row) for row in flags if token in row[0]]
            if index_row:
                label_char =flags[index_row[0]][1]
                label_char =str(label_char).lower()
                label = country_class[label_char]
                labeled =True
                break
    return labeled , label

def city_country_check(place):
    is_labeled = False
    label = None
    place = pop_2(place,3)
    for token in place: #check for countries with one part
        if token in country_code_df.values :
            index_row = [country_code.index(row) for row in country_code if token in row[0]]
            label_char =country_code[index_row[0]][1]
            label_char =str(label_char).lower()
            label = country_class[label_char]
            is_labeled= True
            break
    if not is_labeled:
        for k in range(len(place)-1): #for cities like new york or mexico city
            token ='%s %s' % (place[k],place[k+1])
            token = str(token).lower()
            #print(token)
            if token in cities_df.values :
                index_row = [cities.index(row) for row in cities if token == row[0] or token == row[4]]
                if index_row:
                    label_char =cities[index_row[0]][5]
                    label_char =str(label_char).lower()
                    label = country_class[label_char]
                    is_labeled= True
                    break
    if not is_labeled:
        for token in place: #check for cities with one part
            if token in cities_df.values :
                index_row = [cities.index(row) for row in cities if token == row[0] or token == row[4]]
                if index_row:
                    label_char =cities[index_row[0]][5]
                    label_char =str(label_char).lower()
                    label = country_class[label_char]
                    is_labeled= True
                    break
    return is_labeled , label

def operation(data):
    examples = []
    labels_list = []
    for i in range(len(data)):
        label = None
        namespaces = {}
        is_labeled= False
        if data[i]["place"] != None:
            #print(data[i]["place"])
            label_char = data[i]["place"]['country_code']
            label_char =str(label_char).lower()
            namespaces['p'] = [label_char]
            label = country_class[label_char]
            is_labeled= True

        if data[i]["user"]["location"] != "":
            place= tokenize(data[i]["user"]["location"].strip())
            place=pop_2(place)
            text = []
            text += place
            #print(place)
            namespaces['l'] = text
            if not is_labeled:
                is_labeled , label =city_country_check(place)

        if data[i]["user"]["name"] != "" and not is_labeled:
            user_name= tokenize(data[i]["user"]["name"].strip())
            namespaces['n'] = user_name
            if not is_labeled:
                is_labeled , label = check_flag(user_name)
            if not is_labeled:
                is_labeled , label =city_country_check(user_name)

        if data[i]["user"]["screen_name"] != "" and not is_labeled:
            screen_name= tokenize(data[i]["user"]["screen_name"].strip())
            namespaces['s'] = screen_name
            if not is_labeled:
                is_labeled , label = check_flag(screen_name)
            if not is_labeled:
                is_labeled , label =city_country_check(screen_name)

        if data[i]["user"]["description"] != "" and not is_labeled:
            description= tokenize(data[i]["user"]["description"].strip())
            namespaces['d'] = description
            if not is_labeled:
                is_labeled , label = check_flag(description)
            if not is_labeled:
                is_labeled , label =city_country_check(description)

        if data[i]["full_text"] != "":
            full_text= tokenize(data[i]["full_text"].strip())
            full_text = pop_2(full_text , 1)
            namespaces['f'] = full_text
            if not is_labeled:
                is_labeled , label = check_flag(full_text)

        if not is_labeled:
            if 'url' in data[i]["user"]["entities"]:
                if 'display_url' in data[i]["user"]["entities"]['url']['urls'][0]:
                    if data[i]["user"]["entities"]['url']['urls'][0]['display_url'] != 'None':
                        url = data[i]["user"]["entities"]['url']['urls'][0]['display_url']
                        #print(url)
                        x = re.search("(\.[^c][^o]($|\W))", url)
                        #print(x)
                        if x!=None:
                            x = re.search("(\w\w)", x[0])
                            if x!=None:
                                if x[0] in country_code_df.values:
                                    namespaces['m'] = x[0] 
        if data[i]["lang"] != "":
            lang= tokenize(data[i]["lang"].strip())
            #print(lang)
            if lang[0] in language_code_df.values:
                index_row = [language_code.index(row) for row in language_code if lang[0] in row]
                #print(language_code[index_row[0]][0])
                namespaces['g'] = [language_code[index_row[0]][0]]
                
        if label!=None:
            labels_list.append(label)
            ex = ''
            for ns,words in namespaces.items():
                ex += ' |%s %s ' % (ns, ' '.join(map(sanify,words)))
            examples += [ex]
    return examples , labels_list

def write_files(file_name,examples):
    random.seed(9999)
    random.shuffle(examples)
    middle = int(len(examples)/2)
    train_data = examples[:middle]
    test_data = examples[middle:]
    test_file_name = file_name+".te"
    train_file_name = file_name+".tr"
    with open(test_file_name, 'w') as h:
        for ex in test_data:
                h.write(ex.strip()+'\n')
    with open(train_file_name, 'w') as m:
        for ex in train_data:
                m.write(ex.strip()+'\n')
    return

def take_top_n(labels_list,n):# find top n-1 + others
    res = Counter(labels_list)
    items = list(res.items())
    sorted_list = sorted(items,key=lambda x: (x[1],x[0]),reverse=True)
    fir = sorted_list[:n-1]
    rest = sorted_list[n-1:]
    contry_code_list = [i[0] for i in fir]
    new_country_class = { name: k+1 for k,name in enumerate(contry_code_list) }
    return contry_code_list,new_country_class

def create_new_label(labels_list,n):# to reduce the number of classe to n
    contry_top_list,top_country_class=take_top_n(labels_list,n)
    new_label_list =[]
    for i in range(len(labels_list)):
        label = labels_list[i]
        if label in contry_top_list:
            new_label = top_country_class[label]
        else: 
            new_label = n
        new_label_list.append(new_label)
    return new_label_list, contry_top_list

def add_labels(examples , labels_list):
    new_examples = []
    for i in range(len(labels_list)):
        ex = str(labels_list[i])
        ex +='%s' % examples[i]
        new_examples += [ex]
    return new_examples

def create_vw_file(json_file,n = 20):# write the file name without .json
    with open(json_file+".json","r") as f:
        data=json.load(f)

    examples1 , labels_list1 = operation(data)
    print(labels_list1, examples1)
    new_label_list , contry_top_list= create_new_label(labels_list1,n)
    #top_country_code_list = [get_key(country_class,val-1) for val in new_label_list]
    #print(top_country_code_list)
    #top_country_list=[]
    #for token in top_country_code_list:
    #        top_country_list.append([row[0] for row in country_code if token in row[1]])
    new_examples = add_labels(examples1 , new_label_list)
    write_files("project/%s" %json_file,new_examples)
    return contry_top_list

In [7]:
top_country_list = create_vw_file("easter1",10)

[71, 66, 239, 76, 14, 239, 175, 175, 182, 166, 239, 86, 60, 239, 168, 239, 197, 238, 86, 234, 36, 82, 234, 86, 238, 239, 86, 239, 112, 86, 222, 239, 60, 203, 234, 239, 239, 234, 239, 239, 182, 234, 183, 14, 239, 238, 15, 162, 66, 239, 239, 239, 71, 239, 234, 86, 239, 239, 239, 239, 86, 60, 238, 238, 16, 238, 234, 239, 86, 239, 234, 234, 234, 91, 151, 239, 183, 234, 239, 234, 86, 239, 238, 238, 234, 238, 238, 60, 166, 239, 109, 238, 239, 166, 234, 234, 175, 234, 239, 222, 239, 239, 76, 239, 234, 222, 177, 166, 61, 239, 238, 40, 71, 239, 234, 239, 238, 239, 239, 222, 32, 166, 59, 239, 239, 234, 49, 239, 238, 239, 238, 234, 239, 234, 239, 86, 234, 234, 206, 86, 166, 166, 166, 166, 166, 166, 166, 166, 166, 166, 166, 166, 239, 239, 166, 166, 239, 239, 239, 234, 238, 239, 144, 238, 238, 222, 110, 86, 178, 183, 86, 234, 239, 234, 109, 238, 14, 166, 35, 83, 239, 239, 239, 157, 157, 60, 234, 157, 239, 238, 157, 103, 239, 157, 157, 239, 83, 206, 14, 239, 71, 66, 239, 76, 14, 239, 175, 175, 182, 

In [8]:
print(top_country_list)
print([country_code[i-1][0] for i in top_country_list])

[239, 234, 238, 86, 166, 60, 157, 222, 71]
['united states', 'ukraine', 'united kingdom', 'greece', 'norway', 'czech republic', 'netherlands', 'timor-leste', 'ethiopia']


In [61]:
!vw -k -c -b 27 --oaa 10 -d project/easter1.tr -f project/easter1.model --passes 20 --holdout_after 302 -q pp -q pl -q pg -q lg #--loss_function hinge #--affix +3,-3 --loss_function logistic #hinge

creating quadratic features for pairs: pp pl pg lg 
final_regressor = project/easter1.model
Num weight bits = 27
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = project/easter1.tr.cache
Reading datafile = project/easter1.tr
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0       10        1       59
1.000000 1.000000            2            2.0        2       10       25
1.000000 1.000000            4            4.0        1       10       43
0.875000 0.750000            8            8.0        7       10       42
0.875000 0.875000           16           16.0       10        3       37
0.718750 0.562500           32           32.0        2        1       49
0.609375 0.500000           64           64.0        6        1       52
0.546875 0.484375          128          128.0        1   

In [62]:
!vw -t -i project/easter1.model -d project/easter1.te

creating quadratic features for pairs: pp pl pg lg 
only testing
Num weight bits = 27
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = project/easter1.te
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0        3        3       24
0.500000 1.000000            2            2.0        4        1       61
0.250000 0.000000            4            4.0        1        1       53
0.250000 0.250000            8            8.0       10        1       66
0.312500 0.375000           16           16.0        9        9       28
0.312500 0.312500           32           32.0        2        2       41
0.281250 0.250000           64           64.0        8        8       50
0.312500 0.343750          128          128.0       10        1       46
0.281250 0.250000          256          256.0   

# -b 24 --affix +6 --spelling _ --ngram 2 --loss_function logistic --loss_function hinge -q wl -q ll -q ww --nn 10 --inpass --dropout

In [65]:
!vw -k -c -b 28 --oaa 10 -d project/easter_sample.tr -f project/easter_sample2.model --ngram 2 --passes 20 --loss_function hinge --holdout_after 240 -q pl #--nn 8 --inpass --dropout #-q ww

Generating 2-grams for all namespaces.
creating quadratic features for pairs: pl 
final_regressor = project/easter_sample2.model
Num weight bits = 28
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = project/easter_sample.tr.cache
Reading datafile = project/easter_sample.tr
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0        1        1       52
0.500000 1.000000            2            2.0        6        1       50
0.750000 1.000000            4            4.0       10        1       72
0.750000 0.750000            8            8.0       10        1       72
0.625000 0.500000           16           16.0        1        1       82
0.625000 0.625000           32           32.0        2        1       98
0.578125 0.531250           64           64.0        5        5       50
0.531250

In [66]:
!vw -t -i project/easter_sample2.model -d project/easter_sample.te

Generating 2-grams for all namespaces.
creating quadratic features for pairs: pl 
only testing
Num weight bits = 28
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = project/easter_sample.te
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.000000 0.000000            1            1.0        3        3       84
0.000000 0.000000            2            2.0       10       10      104
0.000000 0.000000            4            4.0        1        1       72
0.000000 0.000000            8            8.0        5        5       90
0.062500 0.125000           16           16.0       10       10       36
0.093750 0.125000           32           32.0        7        7      162
0.109375 0.125000           64           64.0       10       10       58
0.140625 0.171875          128          128.0        1        1       42
0.183594 0.