# We all deal in plain text at some level

## How much of your data is in text or is converted to a text format at some point?

### Let's generate some fake data to play with:

In [None]:
! mkdir demofiles

In [None]:
import pandas as pd
import random
import os

In [None]:
df = pd.DataFrame({
    'letter': list(map(lambda x: chr(x), range(ord('a'), ord('z')+1))),
    'number': range(1,52,2),
    'bool': [not not random.randint(0,2) for i in range(26)]
})

In [None]:
df.to_csv(os.path.join("demofiles","data1.csv"))

In [None]:
!ls demofiles

data1.csv


## SED is a purpose-built tool for editing text programmatically, it's a "stream editor"

### sed will read a file and output with changes we can specify

In [None]:
# For example, we can remove the header of this file, with the 'd' command
! sed '1d' ./demofiles/data1.csv

0,a,1,True
1,b,3,False
2,c,5,True
3,d,7,True
4,e,9,False
5,f,11,True
6,g,13,True
7,h,15,True
8,i,17,False
9,j,19,True
10,k,21,False
11,l,23,True
12,m,25,False
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


In [None]:
# We can also remove the first 10 lines
! sed '1,10d' ./demofiles/data1.csv

9,j,19,True
10,k,21,False
11,l,23,True
12,m,25,False
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


In [112]:
# We don't have to start from the beginning of the file
! sed '11, 15d' ./demofiles/data1.csv

,letter,number,bool
0,a,1,True
1,b,3,False
2,c,5,True
3,d,7,True
4,e,9,False
5,f,11,True
6,g,13,True
7,h,15,True
8,i,17,False
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


### By default, sed prints everything (except deleted lines) We can print selectively '-n'

In [114]:
# What if we're interested in isolating only rows which contain "True" somewhere?
! sed -n '/True/p' ./demofiles/data1.csv

0,a,1,True
2,c,5,True
3,d,7,True
5,f,11,True
6,g,13,True
7,h,15,True
9,j,19,True
11,l,23,True
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True


In [119]:
# Print only line 17
! sed -n '17p' ./demofiles/data1.csv

15,p,31,True


In [None]:
# Print line that contains 17
! sed -n '/17/p' ./demofiles/data1.csv

8,i,17,False
17,r,35,True


In [None]:
# Print line that _starts_ with 17
! sed -n '/^17/p' ./demofiles/data1.csv

17,r,35,True


In [113]:
# Delete every line between a line that starts with 11 and starts with 15
! sed '/^11/,/^15/d' ./demofiles/data1.csv

,letter,number,bool
0,a,1,True
1,b,3,False
2,c,5,True
3,d,7,True
4,e,9,False
5,f,11,True
6,g,13,True
7,h,15,True
8,i,17,False
9,j,19,True
10,k,21,False
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


In [None]:
# $ indicates the end of a line
! sed -n '/True$/p' ./demofiles/data1.csv

0,a,1,True
2,c,5,True
3,d,7,True
5,f,11,True
6,g,13,True
7,h,15,True
9,j,19,True
11,l,23,True
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True


In [120]:
# So, this won't print anything because it won't match anything
! sed -n '/18$/p' ./demofiles/data1.csv

In [None]:
# This will remove all empty lines
! sed '/^$/d' ./demofiles/data1.csv

,letter,number,bool
0,a,1,True
1,b,3,False
2,c,5,True
3,d,7,True
4,e,9,False
5,f,11,True
6,g,13,True
7,h,15,True
8,i,17,False
9,j,19,True
10,k,21,False
11,l,23,True
12,m,25,False
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


## sed can also _replace_ values

In [126]:
! sed 's/True/1/g' ./demofiles/data1.csv

,letter,number,bool
0,a,1,1
1,b,3,False
2,c,5,1
3,d,7,1
4,e,9,False
5,f,11,1
6,g,13,1
7,h,15,1
8,i,17,False
9,j,19,1
10,k,21,False
11,l,23,1
12,m,25,False
13,n,27,1
14,o,29,1
15,p,31,1
16,q,33,1
17,r,35,1
18,s,37,1
19,t,39,False
20,u,41,1
21,v,43,1
22,w,45,1
23,x,47,1
24,y,49,1
25,z,51,False


In [None]:
# We can also "chain" transformations one after the other
! sed 's/True/1/g;s/False/0/g' ./demofiles/data1.csv

,letter,number,bool
0,a,1,1
1,b,3,0
2,c,5,1
3,d,7,1
4,e,9,0
5,f,11,1
6,g,13,1
7,h,15,1
8,i,17,0
9,j,19,1
10,k,21,0
11,l,23,1
12,m,25,0
13,n,27,1
14,o,29,1
15,p,31,1
16,q,33,1
17,r,35,1
18,s,37,1
19,t,39,0
20,u,41,1
21,v,43,1
22,w,45,1
23,x,47,1
24,y,49,1
25,z,51,0


In [128]:
# These transformations can depend on previous ones
! sed 's/True/T/g;s/False/F/g;s/T/1/g;s/F/0/g' ./demofiles/data1.csv

,letter,number,bool
0,a,1,1
1,b,3,0
2,c,5,1
3,d,7,1
4,e,9,0
5,f,11,1
6,g,13,1
7,h,15,1
8,i,17,0
9,j,19,1
10,k,21,0
11,l,23,1
12,m,25,0
13,n,27,1
14,o,29,1
15,p,31,1
16,q,33,1
17,r,35,1
18,s,37,1
19,t,39,0
20,u,41,1
21,v,43,1
22,w,45,1
23,x,47,1
24,y,49,1
25,z,51,0


In [None]:
# We can remove, then insert data
! sed '/^0/,/^10/c\[REMOVED]' ./demofiles/data1.csv

,letter,number,bool
[REMOVED]
11,l,23,True
12,m,25,False
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


In [129]:
# Perhaps I wish to remove all numbers
! sed '1d;s/[0-9],//g' ./demofiles/data1.csv

a,True
b,False
c,True
d,True
e,False
f,1True
g,1True
h,1True
i,1False
j,1True
1k,2False
1l,2True
1m,2False
1n,2True
1o,2True
1p,3True
1q,3True
1r,3True
1s,3True
1t,3False
2u,4True
2v,4True
2w,4True
2x,4True
2y,4True
2z,5False


In [130]:
# We use the metacharacter + to match "one or more"
! sed -E '1d;s/[0-9]+,//g' ./demofiles/data1.csv

a,True
b,False
c,True
d,True
e,False
f,True
g,True
h,True
i,False
j,True
k,False
l,True
m,False
n,True
o,True
p,True
q,True
r,True
s,True
t,False
u,True
v,True
w,True
x,True
y,True
z,False


In [131]:
# We can even match multiple values
! sed -E '1d;s/,(True|False)$//g' ./demofiles/data1.csv

0,a,1
1,b,3
2,c,5
3,d,7
4,e,9
5,f,11
6,g,13
7,h,15
8,i,17
9,j,19
10,k,21
11,l,23
12,m,25
13,n,27
14,o,29
15,p,31
16,q,33
17,r,35
18,s,37
19,t,39
20,u,41
21,v,43
22,w,45
23,x,47
24,y,49
25,z,51


### Let's say I wish to censor phone numbers in a text document\

In [132]:
!echo 500 867-5309
!echo 500 867-5309 | grep -E '[0-9][0-9][0-9] [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
!echo "(500) 867-5309" | grep -E '^\([0-9][0-9][0-9]\) [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
!echo "500 867-5309" | grep -E '^\([0-9][0-9][0-9]\) [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
!echo "500 867-5309" | grep -E '^\(?[0-9][0-9][0-9]\)? [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]'
!echo "500 867-5309" | sed -E 's/^\(?[0-9][0-9][0-9]\)? [0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]/xxx xxx-xxxx/g'
!echo "500 867-5309" | sed -E 's/^\(?[0-9]*\)? [0-9]+-[0-9]+/xxx xxx-xxxx/g'

500 867-5309
500 867-5309
(500) 867-5309
500 867-5309
xxx xxx-xxxx
xxx xxx-xxxx


## sed can be used to edit files "in-place" and be called from a script

In [None]:
%%file demofiles/replace.sh

#!/bin/bash

echo Enter val to replace:
read val1
echo Enter new val:
read val2
#echo DENS: $dens
sed -i "s/$val1/$val2/g" $1

Writing demofiles/replace.sh


In [None]:
! cp demofiles/data1.csv demofiles/data2.csv

In [None]:
! bash demofiles/replace.sh demofiles/data2.csv

Enter val to replace:
18
Enter new val:
1000


In [None]:
! cat demofiles/data2.csv

,letter,number,bool
0,a,1,True
1,b,3,False
2,c,5,True
3,d,7,True
4,e,9,False
5,f,11,True
6,g,13,True
7,h,15,True
8,i,17,False
9,j,19,True
10,k,21,False
11,l,23,True
12,m,25,False
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
1000,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


## Now let's try some simple awk

In [134]:
! awk -F, '{print $0}' demofiles/data1.csv

,letter,number,bool
0,a,1,True
1,b,3,False
2,c,5,True
3,d,7,True
4,e,9,False
5,f,11,True
6,g,13,True
7,h,15,True
8,i,17,False
9,j,19,True
10,k,21,False
11,l,23,True
12,m,25,False
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


In [135]:
!awk -F, '{print $0}' demofiles/data1.csv

,letter,number,bool
0,a,1,True
1,b,3,False
2,c,5,True
3,d,7,True
4,e,9,False
5,f,11,True
6,g,13,True
7,h,15,True
8,i,17,False
9,j,19,True
10,k,21,False
11,l,23,True
12,m,25,False
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


In [137]:
# Awk syntax boils down to
# patterns and actions in the form:
# pattern { action; }
# and is particularly useful for dealing with columnar data

In [136]:
# Awk has special syntax for dealing with columns
!awk -F, '{print $1}' demofiles/data1.csv


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


In [None]:
# Awk can accept input from STDIN, maybe even preprocessed by sed!
! sed '1d' demofiles/data1.csv | awk -F, '{print $1}'

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25


## Awk can be read from a file, if the program is particularly long

In [138]:
%%file demofiles/true-only.awk

BEGIN{
    FS=","
}
{print $0}

Overwriting demofiles/true-only.awk


In [139]:
!awk -f demofiles/true-only.awk demofiles/data1.csv

,letter,number,bool
0,a,1,True
1,b,3,False
2,c,5,True
3,d,7,True
4,e,9,False
5,f,11,True
6,g,13,True
7,h,15,True
8,i,17,False
9,j,19,True
10,k,21,False
11,l,23,True
12,m,25,False
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
19,t,39,False
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True
25,z,51,False


In [140]:
%%file demofiles/true-only.awk

BEGIN{
    FS=","
}
$4 == "True" {print $0}


Overwriting demofiles/true-only.awk


In [141]:
!awk -f demofiles/true-only.awk demofiles/data1.csv

0,a,1,True
2,c,5,True
3,d,7,True
5,f,11,True
6,g,13,True
7,h,15,True
9,j,19,True
11,l,23,True
13,n,27,True
14,o,29,True
15,p,31,True
16,q,33,True
17,r,35,True
18,s,37,True
20,u,41,True
21,v,43,True
22,w,45,True
23,x,47,True
24,y,49,True


In [142]:
%%file demofiles/true-only.awk

BEGIN{
    FS=","
}
$4 == "True" {print $2}


Overwriting demofiles/true-only.awk


In [143]:
!awk -f demofiles/true-only.awk demofiles/data1.csv

a
c
d
f
g
h
j
l
n
o
p
q
r
s
u
v
w
x
y


In [144]:
%%file demofiles/sum-true.awk

BEGIN{
    FS=","

}
$4 == "True" {
    sum += $3
}
END {
    print sum
}


Overwriting demofiles/sum-true.awk


In [145]:
!awk -f demofiles/sum-true.awk demofiles/data1.csv

511


In [146]:
%%file demofiles/sum-true.awk

BEGIN{
    FS=","

}
/True/ {
    sum += $3
}
END {
    print sum
}


Overwriting demofiles/sum-true.awk


In [147]:
!awk -f demofiles/sum-true.awk demofiles/data1.csv

511


In [110]:
! wget https://arxiv.org/stats/get_monthly_submissions
! mv get_monthly_submissions demofiles

--2025-04-11 13:11:21--  https://arxiv.org/stats/get_monthly_submissions
Resolving arxiv.org (arxiv.org)... 151.101.131.42, 151.101.67.42, 151.101.195.42, ...
Connecting to arxiv.org (arxiv.org)|151.101.131.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6389 (6.2K) [text/csv]
Saving to: ‘get_monthly_submissions’


2025-04-11 13:11:21 (66.1 MB/s) - ‘get_monthly_submissions’ saved [6389/6389]



In [111]:
! head ./demofiles/get_monthly_submissions

month,submissions,historical_delta
1991-07,2,-2
1991-08,28,-1
1991-09,58,0
1991-10,76,0
1991-11,64,0
1991-12,78,0
1992-01,193,-105
1992-02,134,-10
1992-03,120,-3


In [None]:
! sed 's/\-[0-9]*//g' ./demofiles/get_monthly_submissions

month,submissions,historical_delta
1991,2,
1991,28,
1991,58,0
1991,76,0
1991,64,0
1991,78,0
1992,193,
1992,134,
1992,120,
1992,225,
1992,237,
1992,237,
1992,296,
1992,227,
1992,336,
1992,401,
1992,453,
1992,404,
1993,400,
1993,428,
1993,504,
1993,505,
1993,538,
1993,536,
1993,643,
1993,541,
1993,518,
1993,678,
1993,714,
1993,738,
1994,610,
1994,653,
1994,753,
1994,738,
1994,841,
1994,888,
1994,863,
1994,766,
1994,884,
1994,932,
1994,1114,
1994,1055,
1995,933,
1995,977,
1995,1168,
1995,913,
1995,1110,
1995,1195,
1995,1017,
1995,1030,
1995,1133,
1995,1229,
1995,1162,
1995,1147,
1996,1051,
1996,1076,
1996,1169,
1996,1220,
1996,1358,
1996,1311,
1996,1424,
1996,1394,
1996,1474,
1996,1520,
1996,1402,
1996,1467,
1997,1308,
1997,1362,
1997,1411,
1997,1468,
1997,1581,
1997,1709,
1997,1804,
1997,1426,
1997,1882,
1997,2007,
1997,1717,
1997,1949,
1998,1723,
1998,1668,0
1998,1911,
1998,1747,
1998,1918,
1998,2089,
1998,2101,
1998,1826,
1998,2414,
1998,2330,
1998,2242,
1998,2203,
1999,1850,
1999,1919

In [None]:
! sed 's/\-[0-9]*//g' ./demofiles/get_monthly_submissions | awk -F"," '{a[$1]+=$2} END {for (i in a) print i FS a[i]}' | sort


1991,306
1992,3263
1993,6743
1994,10097
1995,13014
1996,15866
1997,19624
1998,24172
1999,27704
2000,30601
2001,33214
2002,36121
2003,39414
2004,43727
2005,46855
2006,50227
2007,55638
2008,58915
2009,64047
2010,70131
2011,76578
2012,84603
2013,92641
2014,97517
2015,105280
2016,113380
2017,123523
2018,140616
2019,155866
2020,178329
2021,181630
2022,185692
2023,208493
2024,244031
2025,73085
month,0


In [153]:
import random
import string

def generate_random_text(length):
    letters = string.ascii_letters + string.digits + string.punctuation + '\n'
    return ''.join(random.choice(letters) for i in range(length))

def write_random_text_to_file(filename, length):
    random_text = generate_random_text(length)
    with open(filename, 'w') as file:
        file.write(random_text)

# Example usage
filename = 'random_text.txt'
length = 10000  # Length of the random text
write_random_text_to_file(os.path.join("demofiles",filename), length)

In [157]:
!head demofiles/random_text.txt
!wc -l demofiles/random_text.txt

KwhLV>?.g$,RL1>Ay+O(.
]xie30Z*C2^<oZ"7w381=iM)p9M\E+Ua(.1o!N=^n=qizy^7A[o)K]^{y?4V21Ot#c_|hCPM%z%EF;UTi^Z.Svpq/B
:nv|t&}LN-Yqp`so]7^)jl!`6TyySPy2r_c/%`!aBfL:Uy'-\ShjEY\F8*YlV~>7JY}Aqyb?'&?''~J%cMyz;73/,c~g6:J<jAc!#;~4{6<C_ZESu(QYx=ph[$2<2B;;Kp2CE8vmk)._8PfjHcJ$>7u$d{]6Kj<HST=&`P6u-}-aeF=aq$k:f\/U5x>+$rMa2S}+J<z!eGGnR9B~P%^{p&,|?$$IRHk:*=lU]Vqui
Mi'~w.^sts`V8gr"4Bn=z-4nGe3S|p<$#cxnF@/1N89(-\bJJPd
S..E@/[M0B)5-od>f=A^$a;pb'm|52fYJ/>?:VdHi
2x<Dy|.$DTs-l"p'HvGOKVDy=$~v1/sDVM
umLt)/$*C'C><VMt/A!%XG_zq;=B((!E0CEA4vEx`qIAl+~)zWl6u-yd79
*WnS'D+Us`=b/
eAKZ5axw;'I;/FlD#7zM,3/{]C[S.k]h1
CXi3q:q<IOFMlL.Co#9[Uh^iiC@DG~xQU1/8~.%|M.]hk6cfprWW>bv^
108 demofiles/random_text.txt


In [159]:
# Perhaps we wish to take a sample of a very large file?
!awk 'BEGIN {srand()} !/^$/ { if (rand() <= .1) print $0}' demofiles/random_text.txt #should print about 10 lines
# This takes linear time

CXi3q:q<IOFMlL.Co#9[Uh^iiC@DG~xQU1/8~.%|M.]hk6cfprWW>bv^
AGR]6(=;_(7U.U]-sboZ1]8{"vz_*-fLaZj,DShP
VgEv!]J@hM;^<E~\C?^'&TT33RDZQ[fav[,hPISFX|(GR<4Ea-!d7-.pPrUEgf'(E?BfvC:R(n?OQ]qs
q$^k(ydY(`$Lfa?k^OFm?(pFq=k.OCwmkI{fMN_%BXAMYaD|ue,QnCw&Ow:G0._%O.$u'lFDNFx!uw7r8Ny*5._5w094(t*T#"'SKGt}:[gV,_:g%jFbupN^ti|J"u'nx4NsVtB(Jl@BBOj>1>bcnw|SN!}<A%*r>VloIN}]`LwmvTD98Z"pUCdzw|a3Ez9%;}T29Xz:V~!J~|FExU9Zt=OJa4sC8:&s>:s1HJ\8i2bQ.}Z#1C?[KfnR_-;pYS7HhEwN#C2^PUpRq-V+S$,,3pDS
Y_?)r^tLvEtN%@D.2hHeX7/Q"jIWPp.igteCw5e>R`xfazr<H`6@)fz!EVHo+D+i9~+e55_/<BxplwJWE4epp\qbDvL+W30s,,6HHl?>.M?b.3d-[g8ZXKDb
$.w(M;3}0828xhXp"BihbL"rwUg-~D.+J>Ewmj&ytzr4y$^\yEg1XXh,A)8K?$*w<xTflD4!;c$-5V^k%<Kkl&npSmCqCM/
WMPp_m%z=1Nm/hN6@^/QYoXzW<Gj.&JfJk8Gb_Q*rg{:}>v'[0(GHTwESg4D^/@UI%SoI&k=q[[pS`%JtVHv0|he[JU
}qA8>k*n#({b.n~k
\,C.pJ(d!*\g4@-a{d~P{X<j6FtnfqUID{MYgxq0l%VTCdikQexG#g)q~d64r~%x|i2<5"6kyxSaT[L^V=klD%T.7]F.Qt|e{z'~JMM@&o&Lftkt:xd}~kT;4h!T)~JF(h8/'Xm3'IExfC>aH1/$>q{fV3J*8Mj#GF5m_J2?*hM/QL|'G|KR1yFR7'8F7L[bc{|i3V838@X?.?Pg1B.,uq