-
Notifications
You must be signed in to change notification settings - Fork 1
/
cls.py
153 lines (138 loc) · 6.31 KB
/
cls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# classify.py
# Language classification ported to Python
# (c) Alex King, 12/19/2014
from __future__ import division # For non-integer division
import sys # For file IO
import os # For directory operations
import math # For sqrt and bit vector calculation
import urllib # For web page functionality
import shutil # For copying input files and adding to library
############################## MODEL GENERATION ################################
# str_to_trigrams : string -> list
def str_to_trigrams(string):
    """Return every run of three consecutive characters in `string`.

    The trigrams overlap and are listed in order of appearance, so a string
    of length n yields n - 2 entries. Strings shorter than three characters
    yield an empty list.
    """
    # Slicing replaces the hand-written three-way concatenation and avoids a
    # local named `list`, which shadowed the builtin.
    return [string[start:start + 3] for start in range(len(string) - 2)]
# add_list_to_dict : list, dictionary -> dictionary
def add_list_to_dict(list, dict):
    """Count each trigram of `list` into the occurrence dictionary `dict`.

    `dict` is updated in place (existing counts are incremented, new trigrams
    start at 1) and the same object is returned for convenience.

    The parameter names shadow the builtins; they are kept because they are
    part of the function's calling interface.
    """
    for trigram in list:
        # dict.get replaces dict.has_key, which is deprecated in Python 2 and
        # no longer exists in Python 3.
        dict[trigram] = dict.get(trigram, 0) + 1
    return dict
# single_lang_model : string, string -> tuple (langname, dictionary)
def single_lang_model(mode, langname):
    """Build the trigram model for one category of a mode.

    Reads every entry of the folder "./mode-<mode>/<langname>/" and
    accumulates the trigram counts of all of them into one dictionary.
    Trigrams are extracted per file, so none spans two files.

    Returns the tuple (langname, counts). Any error from os.listdir or open
    (for example a missing folder) propagates to the caller.
    """
    model = {}
    path = "./mode-" + mode + "/" + langname + "/"  # construct folder path
    for entry in os.listdir(path):
        # Close each training file as soon as it has been read instead of
        # leaving the handle to the garbage collector.
        with open(path + entry) as sample:
            text = sample.read()
        add_list_to_dict(str_to_trigrams(text), model)
    return (langname, model)
# build_all_models : string -> list_of_tuples [(langname, dictionary)]
def build_all_models(mode):
    """Return one (name, trigram model) tuple per entry of "./mode-<mode>/".

    `mode` selects the training folder (e.g. lang or subject); each entry
    found in it is handed to single_lang_model, in os.listdir order.
    """
    mode_dir = "./mode-" + mode + "/"
    return [single_lang_model(mode, entry) for entry in os.listdir(mode_dir)]
# make_file_model : filename -> dictionary
def make_file_model(filename):
    """Build the trigram occurrence model of a local file or a web page.

    `filename` is fetched with urllib.urlopen when it starts with "http://"
    or "https://"; anything else is opened as a local path. Returns a
    dictionary mapping each trigram of the content to its occurrence count.
    Errors from opening or reading the source propagate to the caller.
    """
    # https is accepted as well; previously such a URL fell through to
    # open() and failed as a missing local file.
    if filename.startswith(("http://", "https://")):
        sock = urllib.urlopen(filename)
        try:
            text = sock.read()
        finally:
            sock.close()  # release the connection even if read() fails
    else:
        with open(filename) as source:
            text = source.read()
    return add_list_to_dict(str_to_trigrams(text), {})
# make_mode_list : -> list
def make_mode_list():
    """Return the names of all modes available in the working directory.

    A mode is a directory named "mode-<name>" in the current directory; the
    returned list holds the <name> parts, in os.listdir order.
    """
    prefix = "mode-"
    # Only directories can serve as a mode: a stray file such as
    # "mode-notes.txt" would pass validation and then make os.listdir fail
    # when the models are built.
    return [entry[len(prefix):] for entry in os.listdir(".")
            if entry.startswith(prefix) and os.path.isdir(entry)]
############################ CLASSIFICATION ####################################
# nearest_model : dictionary list_of_tuples -> string
def nearest_model(file_model, model_list):
    """Return the name of the model most similar to `file_model`.

    Every (name, model) tuple in `model_list` is scored against `file_model`
    with bit_vector_sim. On a tie the earliest entry wins. An empty
    `model_list` raises ValueError from max().
    """
    scores = [bit_vector_sim(file_model, candidate)
              for candidate in model_list]
    # Verbose output -- uncomment to see the score of each language
    # for candidate, score in zip(model_list, scores):
    #     print(candidate[0] + ": " + str(score))
    best_index = scores.index(max(scores))  # first position of the top score
    return model_list[best_index][0]
# bit_vector_sim : dictionary tuple -> number
def bit_vector_sim(model1, model2):
    """Return the bit vector similarity of two trigram models, from 0 to 1.

    `model1` is a trigram dictionary; `model2` is a (name, dictionary) tuple
    and only its dictionary is used. Occurrence counts are ignored: the score
    is the number of trigrams present in both models divided by
    sqrt(len(model1)) * sqrt(len(model2[1])).

    Returns 0.0 when either model is empty, since nothing can be shared;
    this previously raised ZeroDivisionError.
    """
    reference = model2[1]
    if not model1 or not reference:
        return 0.0
    shared = sum(1 for key in model1 if key in reference)
    # math.sqrt returns floats, so this is a true division even without
    # `from __future__ import division`.
    return shared / (math.sqrt(len(model1)) * math.sqrt(len(reference)))
# add_to_library : string, string -> None
def add_to_library(mode, input):
    """Copy a file or web page into the training library of `mode`.

    `input` is a local path or an http(s) URL. A URL is downloaded first and
    stored as "saved/<url without scheme>.txt", with "/" replaced by "_" so
    the name is always a single file inside "saved/". The user is then shown
    the numbered entries of "./mode-<mode>/" and asked on standard input
    which one is correct; the file is copied into that folder under its base
    name.

    Nothing is copied when the answer is not a listed number. Download and
    file-system errors propagate to the caller.

    The parameter name `input` shadows the builtin; it is kept because it is
    part of the function's calling interface.
    """
    is_url = input.startswith(("http://", "https://"))
    if is_url:
        sock = urllib.urlopen(input)
        try:
            htmltext = sock.read()
        finally:
            sock.close()  # release the connection even if read() fails
        if not os.path.isdir("saved"):
            os.makedirs("saved")
        # A URL with a path ("host/a/b") used to produce a nested file name
        # inside directories that do not exist; flatten it instead.
        flat_name = input.split("//", 1)[1].replace("/", "_")
        htmlname = "saved/" + flat_name + ".txt"
        with open(htmlname, "w") as htmlfile:
            htmlfile.write(htmltext)
    path = "./mode-" + mode + "/"
    print("Please enter the number of the correct type.")
    type_list = os.listdir(path)
    for index, type_name in enumerate(type_list):
        print(str(index) + " " + type_name)
    try:
        ans = int(raw_input())  # raw_input: this script targets Python 2
    except ValueError:
        ans = -1  # treat non-numeric input like an out-of-range choice
    if not 0 <= ans < len(type_list):
        print("Invalid selection; nothing was copied.")
        return
    source = htmlname if is_url else input
    # basename also handles a bare "file.txt"; rsplit("/", 1)[1] raised
    # IndexError when the path contained no "/".
    name = os.path.basename(source)
    dest = path + type_list[ans] + "/" + name
    print("Ready to copy. input = " + source + " dest: " + dest)
    shutil.copyfile(source, dest)
    print("The file has been copied to '" + type_list[ans] +
          "' with the name '" + name + "'!")
# main : -> None
def main():
    """Classify the file or web page named on the command line.

    Expects exactly two arguments: "--<mode>" and a file path or URL. Prints
    the name of the best-matching model of that mode. Prints a usage or
    option message and exits with status 1 when the argument count is wrong
    or the mode is not one of those returned by make_mode_list.
    """
    argv = sys.argv
    if len(argv) != 3:
        sys.stdout.write("Usage: python classify.py [--mode]")
        print(" [file.txt|http://webpage.com]")
        sys.exit(1)

    available_modes = make_mode_list()  # modes found in working directory
    mode = str(argv[1])[2:]  # drop the first two characters ("--")
    if mode not in available_modes:
        print("Unrecognized option. Choose from:")
        for name in available_modes:
            print("--" + name)
        sys.exit(1)

    target = str(argv[2])
    file_model = make_file_model(target)  # model of the input itself
    model_list = build_all_models(mode)  # models of every known category
    print(nearest_model(file_model, model_list))  # print category name

    # Disabled feedback step: offers to add a misclassified input to the
    # training library.
    # ans1 = raw_input("Is this classification correct? [y/n] ")
    # if (ans1 == "n"):
    #     ans2 = raw_input("May your file be copied into" +
    #                      " the training library? [y/n] ")
    #     if ans2 == "y":
    #         add_to_library(mode, str(sys.argv[2]))
# Run main only when executed as a script, so that importing this module
# does not parse sys.argv or call sys.exit.
if __name__ == "__main__":
    main()