In [1]:
import json 
import ijson
import pandas as pd
import re, os
import random
from collections import Counter

In [2]:
country_code_df= pd.read_csv('database/country_code.csv' ,sep = ',')
country_code = country_code_df.values.tolist()

domain_df= pd.read_csv('database/domain.csv' ,sep = '.')
domain_code = domain_df.values.tolist()

flags_df= pd.read_csv('database/flags_new.csv' ,sep = ',')
flags = flags_df.values.tolist()

language_code_df= pd.read_csv('database/language_code.csv' ,sep = ':')
language_code = language_code_df.values.tolist()
    
cities_df= pd.read_csv('database/worldcities.csv' ,sep = ',')
cities = cities_df.values.tolist()

country_to_code = [row[1] for row in country_code]
country_class = { name: k+1 for k,name in enumerate(country_to_code) }#create class for each country code
print(country_code)

[['afghanistan', 'af'], ['ã…land islands', 'ax'], ['albania', 'al'], ['algeria', 'dz'], ['american samoa', 'as'], ['andorra', 'ad'], ['angola', 'ao'], ['anguilla', 'ai'], ['antarctica', 'aq'], ['antigua barbuda', 'ag'], ['argentina', 'ar'], ['armenia', 'am'], ['aruba', 'aw'], ['australia', 'au'], ['austria', 'at'], ['azerbaijan', 'az'], ['bahamas', 'bs'], ['bahrain', 'bh'], ['bangladesh', 'bd'], ['barbados', 'bb'], ['belarus', 'by'], ['belgium', 'be'], ['belize', 'bz'], ['benin', 'bj'], ['bermuda', 'bm'], ['bhutan', 'bt'], ['bolivia', 'bo'], ['bonaire', 'bq'], ['bosnia', 'ba'], ['botswana', 'bw'], ['bouvet island', 'bv'], ['brazil', 'br'], ['british indian', 'io'], ['brunei darussalam', 'bn'], ['bulgaria', 'bg'], ['burkina faso', 'bf'], ['burundi', 'bi'], ['cambodia', 'kh'], ['cameroon', 'cm'], ['canada', 'ca'], ['cape verde', 'cv'], ['cayman islands', 'ky'], ['central african', 'cf'], ['chad', 'td'], ['chile', 'cl'], ['china', 'cn'], ['christmas island', 'cx'], ['cocos', 'cc'], ['colo

In [3]:
### all the functiones needed for this project
def tokenize(s): return re.sub('([^A-Za-z0-9 ]+)', ' \\1 ', s).split()  # add space around anything not alphanum
def sanify(s): return s.replace(':',';').replace('|','/')

def get_key(dicti,val):
    for key, value in dicti.items():
         if val == value:
             return key
 
    return "There is no such Key"

def writeToVWFile(filename, examples):
    with open(filename, 'w') as h:
        for ex in examples:
            h.write(ex.strip()+'\n')
def pop_2(tokens,n=2):# 
    pop_list =[]
    for ii in tokens:
        if len(ii)<=n:
            pop_list.append(ii)
    for ii in pop_list:
        tokens.remove(ii)
    return tokens
    
def check_flag(tokens):
    labeled = False
    label = None
    #tokens = pop_2(tokens)
    for token in tokens: #check for countries with one part
        if token in flags_df.values:
            index_row = [flags.index(row) for row in flags if token in row[0]]
            if index_row:
                label_char =flags[index_row[0]][1]
                label_char =str(label_char).lower()
                label = country_class[label_char]
                labeled =True
                break
    return labeled , label

def city_country_check(place):
    is_labeled = False
    label = None
    place = pop_2(place,3)
    for token in place: #check for countries with one part
        if token in country_code_df.values :
            index_row = [country_code.index(row) for row in country_code if token in row[0]]
            label_char =country_code[index_row[0]][1]
            label_char =str(label_char).lower()
            label = country_class[label_char]
            is_labeled= True
            break
    if not is_labeled:
        for k in range(len(place)-1): #for cities like new york or mexico city
            token ='%s %s' % (place[k],place[k+1])
            token = str(token).lower()
            #print(token)
            if token in cities_df.values :
                index_row = [cities.index(row) for row in cities if token == row[0] or token == row[4]]
                if index_row:
                    label_char =cities[index_row[0]][5]
                    label_char =str(label_char).lower()
                    label = country_class[label_char]
                    is_labeled= True
                    break
    if not is_labeled:
        for token in place: #check for cities with one part
            if token in cities_df.values :
                index_row = [cities.index(row) for row in cities if token == row[0] or token == row[4]]
                if index_row:
                    label_char =cities[index_row[0]][5]
                    label_char =str(label_char).lower()
                    label = country_class[label_char]
                    is_labeled= True
                    break
    return is_labeled , label

def operation(data):
    examples = []
    labels_list = []
    for i in range(len(data)):
        label = None
        namespaces = {}
        is_labeled= False
        if data[i]["place"] != None:
            #print(data[i]["place"])
            label_char = data[i]["place"]['country_code']
            label_char =str(label_char).lower()
            namespaces['p'] = [label_char]
            label = country_class[label_char]
            is_labeled= True

        if data[i]["user"]["location"] != "":
            place= tokenize(data[i]["user"]["location"].strip())
            place=pop_2(place)
            text = []
            text += place
            #print(place)
            namespaces['l'] = text
            if not is_labeled:
                is_labeled , label =city_country_check(place)

        if data[i]["user"]["name"] != "" and not is_labeled:
            user_name= tokenize(data[i]["user"]["name"].strip())
            namespaces['n'] = user_name
            if not is_labeled:
                is_labeled , label = check_flag(user_name)
            if not is_labeled:
                is_labeled , label =city_country_check(user_name)

        if data[i]["user"]["screen_name"] != "" and not is_labeled:
            screen_name= tokenize(data[i]["user"]["screen_name"].strip())
            namespaces['s'] = screen_name
            if not is_labeled:
                is_labeled , label = check_flag(screen_name)
            if not is_labeled:
                is_labeled , label =city_country_check(screen_name)

        if data[i]["user"]["description"] != "" and not is_labeled:
            description= tokenize(data[i]["user"]["description"].strip())
            namespaces['d'] = description
            if not is_labeled:
                is_labeled , label = check_flag(description)
            if not is_labeled:
                is_labeled , label =city_country_check(description)

        if data[i]["full_text"] != "":
            full_text= tokenize(data[i]["full_text"].strip())
            full_text = pop_2(full_text , 1)
            namespaces['f'] = full_text
            if not is_labeled:
                is_labeled , label = check_flag(full_text)

        if not is_labeled:
            if 'url' in data[i]["user"]["entities"]:
                if 'display_url' in data[i]["user"]["entities"]['url']['urls'][0]:
                    if data[i]["user"]["entities"]['url']['urls'][0]['display_url'] != 'None':
                        url = data[i]["user"]["entities"]['url']['urls'][0]['display_url']
                        #print(url)
                        x = re.search("(\.[^c][^o]($|\W))", url)
                        #print(x)
                        if x!=None:
                            x = re.search("(\w\w)", x[0])
                            if x!=None:
                                if x[0] in country_code_df.values:
                                    namespaces['m'] = x[0] 
        if data[i]["lang"] != "":
            lang= tokenize(data[i]["lang"].strip())
            #print(lang)
            if lang[0] in language_code_df.values:
                index_row = [language_code.index(row) for row in language_code if lang[0] in row]
                #print(language_code[index_row[0]][0])
                namespaces['g'] = [language_code[index_row[0]][0]]
                
        if label!=None:
            labels_list.append(label)
            ex = ''
            for ns,words in namespaces.items():
                ex += ' |%s %s ' % (ns, ' '.join(map(sanify,words)))
            examples += [ex]
    return examples , labels_list

def write_files(file_name,examples):
    random.seed(9999)
    random.shuffle(examples)
    middle = int(len(examples)/2)
    train_data = examples[:middle]
    test_data = examples[middle:]
    test_file_name = file_name+".te"
    train_file_name = file_name+".tr"
    with open(test_file_name, 'w') as h:
        for ex in test_data:
                h.write(ex.strip()+'\n')
    with open(train_file_name, 'w') as m:
        for ex in train_data:
                m.write(ex.strip()+'\n')
    return

def take_top_n(labels_list,n):# find top n-1 + others
    res = Counter(labels_list)
    items = list(res.items())
    sorted_list = sorted(items,key=lambda x: (x[1],x[0]),reverse=True)
    fir = sorted_list[:n-1]
    rest = sorted_list[n-1:]
    contry_code_list = [i[0] for i in fir]
    new_country_class = { name: k+1 for k,name in enumerate(contry_code_list) }
    return contry_code_list,new_country_class

def create_new_label(labels_list,n):# to reduce the number of classe to n
    contry_top_list,top_country_class=take_top_n(labels_list,n)
    new_label_list =[]
    for i in range(len(labels_list)):
        label = labels_list[i]
        if label in contry_top_list:
            new_label = top_country_class[label]
        else: 
            new_label = n
        new_label_list.append(new_label)
    return new_label_list, contry_top_list

def add_labels(examples , labels_list):
    new_examples = []
    for i in range(len(labels_list)):
        ex = str(labels_list[i])
        ex +='%s' % examples[i]
        new_examples += [ex]
    return new_examples

def create_vw_file(json_file,n = 20):# write the file name without .json
    with open(json_file+".json","r") as f:
        data=json.load(f)

    examples1 , labels_list1 = operation(data)
    print(labels_list1, examples1)
    new_label_list , contry_top_list= create_new_label(labels_list1,n)
    #top_country_code_list = [get_key(country_class,val-1) for val in new_label_list]
    #print(top_country_code_list)
    #top_country_list=[]
    #for token in top_country_code_list:
    #        top_country_list.append([row[0] for row in country_code if token in row[1]])
    new_examples = add_labels(examples1 , new_label_list)
    write_files("project/%s" %json_file,new_examples)
    return contry_top_list

In [4]:
top_country_list=create_vw_file("ukrain_loc")
print(top_country_list)
print([country_code[i-1][0] for i in top_country_list])

[239, 234, 183, 238, 103, 166, 112, 83, 60, 110, 157, 76, 209, 175, 75, 40, 177, 228, 225]
['united states', 'ukraine', 'russia', 'united kingdom', 'india', 'norway', 'japan', 'germany', 'czech republic', 'italy', 'netherlands', 'france', 'spain', 'philippines', 'finland', 'canada', 'poland', 'turkey', 'tonga']


In [14]:
!vw -k -c -b 27 --oaa 20 -d project/ukrain_loc.tr -f project/ukrain_loc.model --passes 20 --holdout_after 1200 -q pp -q pl -q pg -q lg

creating quadratic features for pairs: pp pl pg lg 
final_regressor = project/ukrain_loc.model
Num weight bits = 27
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = project/ukrain_loc.tr.cache
Reading datafile = project/ukrain_loc.tr
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        3        1       32
1.000000 1.000000            2            2.0        1        3       16
0.750000 0.500000            4            4.0        7        1       44
0.750000 0.750000            8            8.0        1        1       30
0.500000 0.250000           16           16.0        1        1       36
0.468750 0.437500           32           32.0        4        1       52
0.500000 0.531250           64           64.0        3        3       62
0.492188 0.484375          128          128.0   

In [15]:
!vw -t -i project/ukrain_loc.model -d project/ukrain_loc.te

creating quadratic features for pairs: pp pl pg lg 
only testing
Num weight bits = 27
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = project/ukrain_loc.te
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        8       20       63
0.500000 0.000000            2            2.0        1        1       52
0.250000 0.000000            4            4.0        1        1       52
0.125000 0.000000            8            8.0        1        1       67
0.250000 0.375000           16           16.0       20        1       58
0.187500 0.125000           32           32.0        2        2       29
0.187500 0.187500           64           64.0       20       20       47
0.179688 0.171875          128          128.0        2        2       34
0.160156 0.140625          256          256.0

In [12]:
!vw -k -c -b 27 --oaa 20 -d project/ukrain_loc.tr -f project/ukrain_loc2.model --passes 20 --holdout_after 1200 -q pl --nn 8 --inpass --dropout #-q ww

creating quadratic features for pairs: pl 
final_regressor = project/ukrain_loc2.model
using dropout for neural network training
using input passthrough for neural network training
Num weight bits = 27
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = project/ukrain_loc.tr.cache
Reading datafile = project/ukrain_loc.tr
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        3        1       32
1.000000 1.000000            2            2.0        1        3       16
0.750000 0.500000            4            4.0        7        1       44
0.750000 0.750000            8            8.0        1        1       30
0.500000 0.250000           16           16.0        1        1       34
0.468750 0.437500           32           32.0        4        1       49
0.484375 0.500000           64     

In [13]:
!vw -t -i project/ukrain_loc2.model -d project/ukrain_loc.te

creating quadratic features for pairs: pl 
only testing
using dropout for neural network testing
using input passthrough for neural network testing
Num weight bits = 27
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = project/ukrain_loc.te
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.000000 1.000000            1            1.0        8       20       63
0.500000 0.000000            2            2.0        1        1       51
0.250000 0.000000            4            4.0        1        1       50
0.125000 0.000000            8            8.0        1        1       64
0.250000 0.375000           16           16.0       20        1       54
0.218750 0.187500           32           32.0        2        2       29
0.187500 0.156250           64           64.0       20       20       46
0.164062 0.140625          128          128.0        2