Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100755 128 lines (108 sloc) 4.016 kb
d8becf5 Alex Ksikes Slight changes on the README and #! /usr/bin/env python
authored
1 #! /usr/bin/env python
1430ea2 Alex Ksikes First commit
authored
2 # Author: Alex Ksikes (alex.ksikes@gmail.com)
3
4 import extract
5 import MySQLdb
6 import os
7 import re
8
9 def get_cursor_from_env():
10 user, db = os.getenv('USER'), os.getenv('DB')
11 passwd = raw_input('Password (%s@localhost for db "%s"): ' % (user, db))
12 db = MySQLdb.connect(host='localhost', user=user, passwd=passwd, db=db)
13 return db.cursor()
14
15 def create_table(c, config, tbl_name, columns=[], drop=False):
16 if drop:
17 sql = 'drop table if exists %s' % tbl_name
18 print sql
19 c.execute(sql)
20 sql = 'create table %s (\n' % tbl_name
21
22 if config.has_key('sql'):
23 sql += '\t%s,\n' % config['sql']
24
25 for f in config['fields']:
26 if columns and f not in columns:
27 continue
28 sql += '\t%s\t%s,\n' % (f, config[f]['sql'])
29 sql = sql[:-2] + '\n) charset utf8'
30 print sql
31 c.execute(sql)
32
33 def populate(c, tab_file, tbl_name, columns=[], duplicates='ignore', ignore_lines=1):
34 if columns:
35 columns = '(%s)' % ', '.join(columns)
36 else:
37 columns = ''
38 if not ignore_lines:
39 ignore_lines = ''
40 else:
41 ignore_lines = 'ignore %s lines' % ignore_lines
42
43 sql = 'load data local infile "%s" %s into table %s %s %s' % \
44 (tab_file, duplicates, tbl_name, ignore_lines, columns)
45 print sql
46 c.execute(sql)
47
48 def populate_split(c, tab_file, tbl_name, columns=[], num_lines=1000000, duplicates='ignore', ignore_lines=1):
49 for i, f in enumerate(get_split_file(tab_file, num_lines)):
50 if i == 0:
51 ignore_lines = 1
52 else:
53 ignore_lines = 0
54 populate(f, tbl_name, columns, duplicates, ignore_lines)
55
56 def ls_split_file(dir):
57 for f in sorted(os.listdir(dir)):
58 if re.match('x\d\d', f):
59 yield os.path.join(dir, f)
60
61 def get_split_file(path, num_lines):
62 print 'Splitting the the tab files ...'
63 cwd = os.path.dirname(path)
64 os.chdir(cwd)
65 os.system('split -dl %s %s' % (num_lines, path))
66
67 for f in ls_split_file(cwd):
68 yield f
69 os.system('rm %s' % f)
70
71 def read_config_file(config_file):
72 config = extract.parse_config(config_file, compile_regex=False)
73 config['fields'].insert(0, 'filename')
74 config['filename'] = {'sql' : 'varchar(32)'}
75 return config
76
77 def run(config_file, tab_file, tbl_name, columns=[], drop=False, split=False):
78 config = read_config_file(config_file)
79 cursor = get_cursor_from_env()
80
81 create_table(cursor, config, tbl_name, columns, drop)
82 if split:
83 populate_split(cursor, tab_file, tbl_name, columns, num_lines=split)
84 else:
85 populate(cursor, tab_file, tbl_name, columns)
86
87 def usage():
88 print 'Usage:'
89 print ' python populate.py config_file tab_file tbl_name'
90 print
91 print 'Description:'
92 print ' Read the config file and the tab file'
93 print ' and load the data into a mysql table.'
94 print
95 print 'Options:'
96 print ' -d, --drop : drop table if it exists'
97 print ' -c, --columns : insert these columns only'
98 print ' -s, --split <num> : split the tab file into chunks of num rows each'
99 print
100 print 'Email bugs/suggestions to Alex Ksikes (alex.ksikes@gmail.com)'
101
102 import sys, getopt
103 def main():
104 try:
105 opts, args = getopt.getopt(sys.argv[1:], 'c:ds:h',
106 ['columns=', 'drop', 'split=', 'help'])
107 except getopt.GetoptError:
108 usage(); sys.exit(2)
109
110 columns, drop, split = [], False, False
111 for o, a in opts:
112 if o in ('-c', '--columns'):
113 columns = a.split()
114 elif o in ('-d', '--drop'):
115 drop = True
116 elif o in ('-s', '--split'):
117 split = int(a)
118 elif o in ('-h', '--help'):
119 usage(); sys.exit()
120
121 if len(args) < 2:
122 usage()
123 else:
124 run(args[0], args[1], args[2], columns, drop, split)
125
126 if __name__ == '__main__':
127 main()
Something went wrong with that request. Please try again.